From a99c3ad7776b60a65d552345d1bd84c45180bf2d Mon Sep 17 00:00:00 2001
From: sgolebiewski-intel
Date: Mon, 17 Mar 2025 16:45:49 +0100
Subject: [PATCH] [DOCS] Removal of notebooks from docs for 25.0

Signed-off-by: sgolebiewski-intel
---
 README.md | 2 +-
 docs/CMakeLists.txt | 9 -
 .../openvino-integrations.rst | 2 +-
 docs/articles_en/get-started.rst | 19 +-
 .../install-openvino-archive-linux.rst | 8 +-
 .../install-openvino-archive-macos.rst | 8 +-
 .../install-openvino-archive-windows.rst | 8 +-
 .../install-openvino-genai.rst | 4 +-
 .../install-openvino/install-openvino-pip.rst | 8 +-
 .../interactive-tutorials-python.rst | 36 +-
 .../notebooks-installation.rst | 408 +-
 .../run-notebooks.rst | 135 +
 .../compressing-models-during-training.rst | 2 +-
 .../inference-devices-and-modes.rst | 8 +-
 .../auto-device-selection.rst | 4 +-
 .../gpu-device.rst | 2 +-
 .../npu-device.rst | 4 +-
 ...tegrate-openvino-with-your-application.rst | 2 +-
 .../layout-api-overview.rst | 2 +-
 .../running-inference/stateful-models.rst | 2 +-
 .../obtaining-stateful-openvino-model.rst | 2 +-
 docs/nbdoc/README.md | 48 -
 docs/nbdoc/__init__.py | 0
 docs/nbdoc/consts.py | 48 -
 docs/nbdoc/nbdoc.py | 154 -
 docs/nbdoc/requirements.txt | 3 -
 docs/nbdoc/utils.py | 57 -
 .../3D-pose-estimation-with-output.rst | 689 ---
 ...-segmentation-point-clouds-with-output.rst | 333 --
 ...entation-point-clouds-with-output_11_2.png | 3 -
 ...entation-point-clouds-with-output_16_1.png | 3 -
 .../action-recognition-webcam-with-output.rst | 710 ---
 ...on-recognition-webcam-with-output_22_0.png | 3 -
 docs/notebooks/all_notebooks_paths.txt | 177 -
 docs/notebooks/animate-anyone-with-output.rst | 1266 -----
 docs/notebooks/async-api-with-output.rst | 659 ---
 .../async-api-with-output_17_0.png | 3 -
 .../async-api-with-output_21_0.png | 3 -
 .../async-api-with-output_23_0.png | 3 -
 .../async-api-with-output_29_0.png | 3 -
 docs/notebooks/auto-device-with-output.rst | 707 ---
 .../auto-device-with-output_14_1.jpg | 3 -
 .../auto-device-with-output_14_1.png | 3 -
 .../auto-device-with-output_27_0.png | 3 -
 .../auto-device-with-output_28_0.png | 3 -
 .../bark-text-to-audio-with-output.rst | 1043 ----
 ...visual-language-processing-with-output.rst | 1408 ------
 ...l-language-processing-with-output_25_0.png | 3 -
 ...l-language-processing-with-output_27_0.png | 3 -
 ...l-language-processing-with-output_47_0.png | 3 -
 ...l-language-processing-with-output_49_0.png | 3 -
 ...al-language-processing-with-output_7_0.png | 3 -
 docs/notebooks/catvton-with-output.rst | 387 --
 ...clip-language-saliency-map-with-output.rst | 928 ----
 ...language-saliency-map-with-output_15_0.png | 3 -
 ...language-saliency-map-with-output_17_0.png | 3 -
 ...language-saliency-map-with-output_19_1.png | 3 -
 ...language-saliency-map-with-output_29_1.png | 3 -
 ...language-saliency-map-with-output_35_1.png | 3 -
 ...p-zero-shot-classification-with-output.rst | 793 ---
 ...o-shot-classification-with-output_13_0.png | 3 -
 ...o-shot-classification-with-output_26_0.png | 3 -
 ...ro-shot-classification-with-output_6_0.png | 3 -
 ...ontrolnet-stable-diffusion-with-output.rst | 1826 -------
 ...lnet-stable-diffusion-with-output_17_0.png | 3 -
 ...lnet-stable-diffusion-with-output_34_1.jpg | 3 -
 ...lnet-stable-diffusion-with-output_34_1.png | 3 -
 ...lnet-stable-diffusion-with-output_50_0.png | 3 -
 ...olnet-stable-diffusion-with-output_8_0.png | 3 -
 .../convert-to-openvino-with-output.rst | 871 ----
 .../convnext-classification-with-output.rst | 250 -
 ...segmentation-quantize-nncf-with-output.rst | 1023 ----
...ntation-quantize-nncf-with-output_14_1.png | 3 - ...ntation-quantize-nncf-with-output_37_1.png | 3 - ...ntation-quantize-nncf-with-output_42_0.jpg | 3 - ...ddcolor-image-colorization-with-output.rst | 734 --- ...or-image-colorization-with-output_10_0.jpg | 3 - ...or-image-colorization-with-output_10_0.png | 3 - ...or-image-colorization-with-output_17_0.jpg | 3 - ...or-image-colorization-with-output_17_0.png | 3 - ...or-image-colorization-with-output_26_0.jpg | 3 - ...or-image-colorization-with-output_26_0.png | 3 - ...lor-image-colorization-with-output_9_0.jpg | 3 - ...lor-image-colorization-with-output_9_0.png | 3 - docs/notebooks/deepseek-r1-with-output.rst | 500 -- .../depth-anything-v2-with-output.rst | 1122 ----- .../depth-anything-v2-with-output_15_1.png | 3 - .../depth-anything-v2-with-output_25_1.png | 3 - .../depth-anything-v2-with-output_44_0.png | 3 - .../depth-anything-v2-with-output_9_1.jpg | 3 - .../depth-anything-v2-with-output_9_1.png | 3 - docs/notebooks/depth-anything-with-output.rst | 1080 ---- .../depth-anything-with-output_11_1.jpg | 3 - .../depth-anything-with-output_11_1.png | 3 - .../depth-anything-with-output_18_0.png | 3 - .../depth-anything-with-output_27_0.png | 3 - .../depth-anything-with-output_46_0.png | 3 - .../detectron2-to-openvino-with-output.rst | 623 --- ...etectron2-to-openvino-with-output_22_0.jpg | 3 - ...etectron2-to-openvino-with-output_22_0.png | 3 - ...etectron2-to-openvino-with-output_32_0.jpg | 3 - ...etectron2-to-openvino-with-output_32_0.png | 3 - ...detectron2-to-openvino-with-output_8_0.jpg | 3 - ...detectron2-to-openvino-with-output_8_0.png | 3 - .../distil-whisper-asr-with-output.rst | 1112 ----- ...micrafter-animating-images-with-output.rst | 1558 ------ docs/notebooks/efficient-sam-with-output.rst | 1343 ----- .../efficient-sam-with-output_12_0.jpg | 3 - .../efficient-sam-with-output_12_0.png | 3 - .../efficient-sam-with-output_17_0.png | 3 - .../efficient-sam-with-output_17_1.png | 3 - .../efficient-sam-with-output_25_0.png | 3 - .../efficient-sam-with-output_25_1.png | 3 - .../efficient-sam-with-output_36_0.png | 3 - .../efficient-sam-with-output_36_1.png | 3 - .../encodec-audio-compression-with-output.rst | 637 --- ...dec-audio-compression-with-output_19_1.png | 3 - ...dec-audio-compression-with-output_38_1.png | 3 - ...odec-audio-compression-with-output_6_2.png | 3 - .../explainable-ai-1-basic-with-output.rst | 300 -- ...xplainable-ai-1-basic-with-output_11_2.png | 3 - ...xplainable-ai-1-basic-with-output_19_0.png | 3 - ...explainable-ai-2-deep-dive-with-output.rst | 907 ---- ...inable-ai-2-deep-dive-with-output_11_2.png | 3 - ...inable-ai-2-deep-dive-with-output_22_1.png | 3 - ...inable-ai-2-deep-dive-with-output_25_1.png | 3 - ...inable-ai-2-deep-dive-with-output_34_1.png | 3 - ...inable-ai-2-deep-dive-with-output_35_0.png | 3 - ...inable-ai-2-deep-dive-with-output_36_0.png | 3 - ...inable-ai-2-deep-dive-with-output_49_0.png | 3 - ...inable-ai-2-deep-dive-with-output_52_0.png | 3 - ...inable-ai-2-deep-dive-with-output_63_1.png | 3 - ...le-ai-3-map-interpretation-with-output.rst | 839 ---- ...-3-map-interpretation-with-output_50_0.png | 3 - ...-3-map-interpretation-with-output_53_0.png | 3 - ...-3-map-interpretation-with-output_56_0.png | 3 - ...-3-map-interpretation-with-output_59_0.png | 3 - .../fast-segment-anything-with-output.rst | 907 ---- ...fast-segment-anything-with-output_21_0.jpg | 3 - ...fast-segment-anything-with-output_21_0.png | 3 - .../fast-segment-anything-with-output_9_0.jpg | 3 - 
.../fast-segment-anything-with-output_9_0.png | 3 - docs/notebooks/film-slowmo-with-output.rst | 523 -- .../film-slowmo-with-output_14_0.png | 3 - .../film-slowmo-with-output_29_0.png | 3 - .../film-slowmo-with-output_7_0.png | 3 - docs/notebooks/florence2-with-output.rst | 645 --- .../florence2-with-output_15_0.jpg | 3 - .../florence2-with-output_15_0.png | 3 - .../florence2-with-output_18_0.png | 3 - .../flux.1-image-generation-with-output.rst | 387 -- ...ux.1-image-generation-with-output_16_1.jpg | 3 - ...ux.1-image-generation-with-output_16_1.png | 3 - .../freevc-voice-conversion-with-output.rst | 770 --- docs/notebooks/glm-edge-v-with-output.rst | 518 -- .../glm-edge-v-with-output_12_1.jpg | 3 - .../glm-edge-v-with-output_12_1.png | 3 - docs/notebooks/gpu-device-with-output.rst | 1452 ------ .../grammar-correction-with-output.rst | 826 ---- .../grounded-segment-anything-with-output.rst | 955 ---- ...nded-segment-anything-with-output_30_0.jpg | 3 - ...nded-segment-anything-with-output_30_0.png | 3 - ...nded-segment-anything-with-output_46_0.jpg | 3 - ...nded-segment-anything-with-output_46_0.png | 3 - .../notebooks/handwritten-ocr-with-output.rst | 434 -- .../handwritten-ocr-with-output_22_0.png | 3 - .../handwritten-ocr-with-output_32_1.png | 3 - .../notebooks/hello-detection-with-output.rst | 274 - .../hello-detection-with-output_11_1.png | 3 - .../hello-detection-with-output_16_0.png | 3 - docs/notebooks/hello-npu-with-output.rst | 965 ---- .../hello-segmentation-with-output.rst | 283 -- .../hello-segmentation-with-output_11_2.png | 3 - .../hello-segmentation-with-output_13_1.png | 3 - .../hello-segmentation-with-output_17_0.png | 3 - docs/notebooks/hello-world-with-output.rst | 233 - .../hello-world-with-output_11_1.png | 3 - .../hugging-face-hub-with-output.rst | 565 --- ...nyuan-dit-image-generation-with-output.rst | 945 ---- ...-dit-image-generation-with-output_31_0.jpg | 3 - ...-dit-image-generation-with-output_31_0.png | 3 - ...lassification-quantization-with-output.rst | 688 --- ...fication-quantization-with-output_31_2.png | 3 - .../image-to-image-genai-with-output.rst | 518 -- .../image-to-image-genai-with-output_11_0.jpg | 3 - .../image-to-image-genai-with-output_11_0.png | 3 - .../image-to-image-genai-with-output_13_1.jpg | 3 - .../image-to-image-genai-with-output_13_1.png | 3 - .../image-to-image-genai-with-output_15_0.jpg | 3 - .../image-to-image-genai-with-output_15_0.png | 3 - .../image-to-image-genai-with-output_16_1.jpg | 3 - .../image-to-image-genai-with-output_16_1.png | 3 - .../inpainting-genai-with-output.rst | 580 --- .../inpainting-genai-with-output_10_0.jpg | 3 - .../inpainting-genai-with-output_10_0.png | 3 - .../inpainting-genai-with-output_12_1.jpg | 3 - .../inpainting-genai-with-output_12_1.png | 3 - .../inpainting-genai-with-output_14_0.jpg | 3 - .../inpainting-genai-with-output_14_0.png | 3 - .../inpainting-genai-with-output_15_1.jpg | 3 - .../inpainting-genai-with-output_15_1.png | 3 - .../inpainting-genai-with-output_18_2.jpg | 3 - .../inpainting-genai-with-output_18_2.png | 3 - .../inpainting-genai-with-output_21_1.jpg | 3 - .../inpainting-genai-with-output_21_1.png | 3 - docs/notebooks/instant-id-with-output.rst | 4391 ----------------- .../instant-id-with-output_16_0.jpg | 3 - .../instant-id-with-output_16_0.png | 3 - .../instant-id-with-output_17_0.jpg | 3 - .../instant-id-with-output_17_0.png | 3 - .../instant-id-with-output_40_0.jpg | 3 - .../instant-id-with-output_40_0.png | 3 - .../instant-id-with-output_66_0.png | 3 - 
...ruct-pix2pix-image-editing-with-output.rst | 1377 ------ ...pix2pix-image-editing-with-output_24_0.png | 3 - ...pix2pix-image-editing-with-output_36_2.png | 3 - docs/notebooks/internvl2-with-output.rst | 509 -- .../internvl2-with-output_14_0.jpg | 3 - .../internvl2-with-output_14_0.png | 3 - ...anus-multimodal-generation-with-output.rst | 643 --- ...multimodal-generation-with-output_14_1.jpg | 3 - ...multimodal-generation-with-output_14_1.png | 3 - ...multimodal-generation-with-output_18_0.jpg | 3 - ...multimodal-generation-with-output_18_0.png | 3 - ...classification-to-openvino-with-output.rst | 376 -- ...ification-to-openvino-with-output_16_0.jpg | 3 - ...ification-to-openvino-with-output_16_0.png | 3 - docs/notebooks/jina-clip-with-output.rst | 1001 ---- .../jina-clip-with-output_11_0.png | 3 - .../jina-clip-with-output_21_0.png | 3 - .../jina-clip-with-output_37_0.png | 3 - .../knowledge-graphs-conve-with-output.rst | 727 --- ...modal-large-language-model-with-output.rst | 1495 ------ ...-large-language-model-with-output_29_1.jpg | 3 - ...-large-language-model-with-output_29_1.png | 3 - ...-large-language-model-with-output_48_1.png | 3 - ...l-large-language-model-with-output_8_0.jpg | 3 - ...l-large-language-model-with-output_8_0.png | 3 - .../language-quantize-bert-with-output.rst | 712 --- ...cy-models-image-generation-with-output.rst | 847 ---- ...dels-image-generation-with-output_13_0.jpg | 3 - ...dels-image-generation-with-output_13_0.png | 3 - ...dels-image-generation-with-output_27_1.jpg | 3 - ...dels-image-generation-with-output_27_1.png | 3 - ...dels-image-generation-with-output_37_0.jpg | 3 - ...dels-image-generation-with-output_37_0.png | 3 - .../lcm-lora-controlnet-with-output.rst | 1763 ------- .../lcm-lora-controlnet-with-output_10_1.png | 3 - .../lcm-lora-controlnet-with-output_27_2.jpg | 3 - .../lcm-lora-controlnet-with-output_27_2.png | 3 - .../lcm-lora-controlnet-with-output_28_0.png | 3 - .../lcm-lora-controlnet-with-output_42_1.png | 3 - ...a-multimodal-chatbot-genai-with-output.rst | 528 -- ...timodal-chatbot-genai-with-output_21_0.jpg | 3 - ...timodal-chatbot-genai-with-output_21_0.png | 3 - ...multimodal-chatbot-optimum-with-output.rst | 524 -- ...modal-chatbot-optimum-with-output_20_0.jpg | 3 - ...modal-chatbot-optimum-with-output_20_0.png | 3 - ...va-next-multimodal-chatbot-with-output.rst | 454 -- ...xt-multimodal-chatbot-with-output_18_0.jpg | 3 - ...xt-multimodal-chatbot-with-output_18_0.png | 3 - ...lm-agent-functioncall-qwen-with-output.rst | 527 -- .../llm-agent-rag-llamaindex-with-output.rst | 514 -- .../llm-agent-react-langchain-with-output.rst | 838 ---- .../notebooks/llm-agent-react-with-output.rst | 687 --- .../llm-chatbot-generate-api-with-output.rst | 1112 ----- docs/notebooks/llm-chatbot-with-output.rst | 1414 ------ .../llm-question-answering-with-output.rst | 877 ---- .../llm-rag-langchain-genai-with-output.rst | 1374 ------ .../llm-rag-langchain-with-output.rst | 1441 ------ .../llm-rag-llamaindex-with-output.rst | 1386 ------ docs/notebooks/localai-with-output.rst | 231 - docs/notebooks/ltx-video-with-output.rst | 502 -- ...a-content-type-recognition-with-output.rst | 387 -- docs/notebooks/meter-reader-with-output.rst | 832 ---- .../meter-reader-with-output_16_1.png | 3 - .../meter-reader-with-output_18_1.png | 3 - .../meter-reader-with-output_20_1.png | 3 - .../meter-reader-with-output_22_1.png | 3 - .../meter-reader-with-output_24_1.png | 3 - ...nicpm-v-multimodal-chatbot-with-output.rst | 390 -- 
...-v-multimodal-chatbot-with-output_12_1.jpg | 3 - ...-v-multimodal-chatbot-with-output_12_1.png | 3 - docs/notebooks/mllama-3.2-with-output.rst | 832 ---- .../mllama-3.2-with-output_19_1.jpg | 3 - .../mllama-3.2-with-output_19_1.png | 3 - ...sively-multilingual-speech-with-output.rst | 961 ---- .../mobileclip-video-search-with-output.rst | 974 ---- ...bileclip-video-search-with-output_12_0.png | 3 - ...bileclip-video-search-with-output_17_1.png | 3 - ...bileclip-video-search-with-output_28_1.png | 3 - docs/notebooks/model-server-with-output.rst | 960 ---- .../model-server-with-output_23_2.png | 3 - .../model-server-with-output_28_1.png | 3 - .../modelscope-to-openvino-with-output.rst | 565 --- ...odelscope-to-openvino-with-output_12_0.jpg | 3 - ...odelscope-to-openvino-with-output_12_0.png | 3 - ...multilora-image-generation-with-output.rst | 473 -- ...lora-image-generation-with-output_15_0.jpg | 3 - ...lora-image-generation-with-output_15_0.png | 3 - ...lora-image-generation-with-output_18_0.jpg | 3 - ...lora-image-generation-with-output_18_0.png | 3 - ...lora-image-generation-with-output_21_0.jpg | 3 - ...lora-image-generation-with-output_21_0.png | 3 - .../multimodal-rag-llamaindex-with-output.rst | 840 ---- ...imodal-rag-llamaindex-with-output_29_3.png | 3 - .../music-generation-with-output.rst | 676 --- .../named-entity-recognition-with-output.rst | 417 -- ...o-llava-multimodal-chatbot-with-output.rst | 663 --- ...va-multimodal-chatbot-with-output_22_0.jpg | 3 - ...va-multimodal-chatbot-with-output_22_0.png | 3 - .../notebooks_with_binder_buttons.txt | 38 - .../notebooks_with_colab_buttons.txt | 67 - ...tract-structure-extraction-with-output.rst | 495 -- .../object-detection-with-output.rst | 329 -- .../object-detection-with-output_13_0.png | 3 - docs/notebooks/omnigen-with-output.rst | 363 -- .../omnigen-with-output_14_1.jpg | 3 - .../omnigen-with-output_14_1.png | 3 - .../omnigen-with-output_16_1.jpg | 3 - .../omnigen-with-output_16_1.png | 3 - docs/notebooks/omniparser-with-output.rst | 705 --- .../omniparser-with-output_32_0.jpg | 3 - .../omniparser-with-output_32_0.png | 3 - .../oneformer-segmentation-with-output.rst | 745 --- ...neformer-segmentation-with-output_23_1.jpg | 3 - ...neformer-segmentation-with-output_23_1.png | 3 - ...neformer-segmentation-with-output_27_0.jpg | 3 - ...neformer-segmentation-with-output_27_0.png | 3 - ...neformer-segmentation-with-output_39_1.jpg | 3 - ...neformer-segmentation-with-output_39_1.png | 3 - ...neformer-segmentation-with-output_39_3.jpg | 3 - ...neformer-segmentation-with-output_39_3.png | 3 - docs/notebooks/openvino-api-with-output.rst | 1017 ---- .../openvino-tokenizers-with-output.rst | 570 --- docs/notebooks/openvoice-with-output.rst | 1067 ---- ...ical-character-recognition-with-output.rst | 481 -- ...character-recognition-with-output_13_0.png | 3 - ...character-recognition-with-output_23_0.png | 3 - ...character-recognition-with-output_25_0.jpg | 3 - ...character-recognition-with-output_25_0.png | 3 - ...haracter-recognition-with-output_25_10.jpg | 3 - ...haracter-recognition-with-output_25_10.png | 3 - ...character-recognition-with-output_25_2.jpg | 3 - ...character-recognition-with-output_25_2.png | 3 - ...character-recognition-with-output_25_4.jpg | 3 - ...character-recognition-with-output_25_4.png | 3 - ...character-recognition-with-output_25_6.jpg | 3 - ...character-recognition-with-output_25_6.png | 3 - ...character-recognition-with-output_25_8.jpg | 3 - ...character-recognition-with-output_25_8.png | 3 - 
.../optimize-preprocessing-with-output.rst | 643 --- ...ptimize-preprocessing-with-output_14_1.png | 3 - .../outetts-text-to-speech-with-output.rst | 374 -- .../paddle-ocr-webcam-with-output.rst | 668 --- ...to-openvino-classification-with-output.rst | 505 -- ...envino-classification-with-output_14_3.png | 3 - ...envino-classification-with-output_22_1.png | 3 - ...envino-classification-with-output_26_1.png | 3 - ...envino-classification-with-output_29_1.png | 3 - ...penvino-classification-with-output_7_1.png | 3 - .../parler-tts-text-to-speech-with-output.rst | 424 -- .../notebooks/person-counting-with-output.rst | 277 -- .../person-counting-with-output_14_0.png | 3 - .../notebooks/person-tracking-with-output.rst | 687 --- docs/notebooks/phi-3-vision-with-output.rst | 456 -- .../phi-3-vision-with-output_14_1.jpg | 3 - .../phi-3-vision-with-output_14_1.png | 3 - docs/notebooks/photo-maker-with-output.rst | 841 ---- .../photo-maker-with-output_34_0.png | 3 - .../pix2struct-docvqa-with-output.rst | 321 -- .../pix2struct-docvqa-with-output_11_0.jpg | 3 - .../pix2struct-docvqa-with-output_11_0.png | 3 - docs/notebooks/pixart-with-output.rst | 680 --- .../pixart-with-output_27_0.jpg | 3 - .../pixart-with-output_27_0.png | 3 - .../pixart-with-output_40_2.png | 3 - .../pixart-with-output_6_0.jpg | 3 - .../pixart-with-output_6_0.png | 3 - docs/notebooks/pixtral-with-output.rst | 281 -- .../pixtral-with-output_12_1.jpg | 3 - .../pixtral-with-output_12_1.png | 3 - .../notebooks/pose-estimation-with-output.rst | 827 ---- .../pytorch-onnx-to-openvino-with-output.rst | 624 --- ...orch-onnx-to-openvino-with-output_22_0.png | 3 - ...orch-onnx-to-openvino-with-output_27_0.png | 3 - ...orch-onnx-to-openvino-with-output_29_0.png | 3 - ...training-quantization-nncf-with-output.rst | 740 --- ...uantization-aware-training-with-output.rst | 823 --- ...on-sparsity-aware-training-with-output.rst | 536 -- .../pytorch-to-openvino-with-output.rst | 848 ---- .../pytorch-to-openvino-with-output_11_0.png | 3 - .../pytorch-to-openvino-with-output_20_0.png | 3 - .../pytorch-to-openvino-with-output_31_0.png | 3 - .../pytorch-to-openvino-with-output_35_0.png | 3 - .../pytorch-to-openvino-with-output_39_0.png | 3 - .../pytorch-to-openvino-with-output_43_0.png | 3 - .../pytorch-to-openvino-with-output_47_0.png | 3 - docs/notebooks/qrcode-monster-with-output.rst | 1500 ------ .../qrcode-monster-with-output_22_1.jpg | 3 - .../qrcode-monster-with-output_22_1.png | 3 - .../qrcode-monster-with-output_39_0.png | 3 - docs/notebooks/qwen2-audio-with-output.rst | 475 -- docs/notebooks/qwen2-vl-with-output.rst | 509 -- .../qwen2-vl-with-output_15_0.jpg | 3 - .../qwen2-vl-with-output_15_0.png | 3 - .../riffusion-text-to-music-with-output.rst | 557 --- ...ffusion-text-to-music-with-output_14_0.jpg | 3 - ...ffusion-text-to-music-with-output_14_0.png | 3 - .../rmbg-background-removal-with-output.rst | 318 -- ...bg-background-removal-with-output_14_0.png | 3 - ...mbg-background-removal-with-output_8_0.png | 3 - ...ce-text-to-video-retrieval-with-output.rst | 523 -- .../sana-image-generation-with-output.rst | 352 -- ...sana-image-generation-with-output_15_1.jpg | 3 - ...sana-image-generation-with-output_15_1.png | 3 - docs/notebooks/sdxl-turbo-with-output.rst | 894 ---- .../sdxl-turbo-with-output_11_1.jpg | 3 - .../sdxl-turbo-with-output_11_1.png | 3 - .../sdxl-turbo-with-output_17_1.jpg | 3 - .../sdxl-turbo-with-output_17_1.png | 3 - .../sdxl-turbo-with-output_29_3.jpg | 3 - .../sdxl-turbo-with-output_29_3.png | 3 - 
.../sdxl-turbo-with-output_30_3.jpg | 3 - .../sdxl-turbo-with-output_30_3.png | 3 - .../segment-anything-2-image-with-output.rst | 1376 ------ ...ment-anything-2-image-with-output_12_0.png | 3 - ...ment-anything-2-image-with-output_16_0.png | 3 - ...ment-anything-2-image-with-output_18_0.png | 3 - ...ment-anything-2-image-with-output_33_0.png | 3 - ...ment-anything-2-image-with-output_39_0.png | 3 - ...ment-anything-2-image-with-output_47_0.png | 3 - ...ment-anything-2-image-with-output_51_0.png | 3 - ...ment-anything-2-image-with-output_56_0.png | 3 - ...ment-anything-2-image-with-output_60_0.png | 3 - ...ment-anything-2-image-with-output_65_0.png | 3 - ...ment-anything-2-image-with-output_77_1.jpg | 3 - ...ment-anything-2-image-with-output_77_1.png | 3 - ...ment-anything-2-image-with-output_92_0.png | 3 - .../segment-anything-2-video-with-output.rst | 887 ---- ...ment-anything-2-video-with-output_40_1.png | 3 - ...ment-anything-2-video-with-output_46_0.png | 3 - .../segment-anything-with-output.rst | 1705 ------- .../segment-anything-with-output_21_0.png | 3 - .../segment-anything-with-output_28_0.png | 3 - .../segment-anything-with-output_35_0.png | 3 - .../segment-anything-with-output_39_0.png | 3 - .../segment-anything-with-output_44_0.png | 3 - .../segment-anything-with-output_48_0.png | 3 - .../segment-anything-with-output_53_0.png | 3 - .../segment-anything-with-output_69_1.jpg | 3 - .../segment-anything-with-output_69_1.png | 3 - .../segment-anything-with-output_81_0.png | 3 - .../segment-anything-with-output_83_1.jpg | 3 - .../segment-anything-with-output_83_1.png | 3 - ...-shot-image-classification-with-output.rst | 635 --- ...-image-classification-with-output_13_1.png | 3 - ...-image-classification-with-output_24_1.png | 3 - ...t-image-classification-with-output_6_1.png | 3 - ...tch-to-image-pix2pix-turbo-with-output.rst | 614 --- ...o-image-pix2pix-turbo-with-output_15_0.jpg | 3 - ...o-image-pix2pix-turbo-with-output_15_0.png | 3 - ...o-image-pix2pix-turbo-with-output_19_0.jpg | 3 - ...o-image-pix2pix-turbo-with-output_19_0.png | 3 - .../softvc-voice-conversion-with-output.rst | 340 -- ...sound-generation-audioldm2-with-output.rst | 855 ---- .../sparsity-optimization-with-output.rst | 202 - .../speculative-sampling-with-output.rst | 516 -- ...tion-quantization-wav2vec2-with-output.rst | 478 -- ...hbrain-emotion-recognition-with-output.rst | 348 -- docs/notebooks/stable-audio-with-output.rst | 621 --- ...e-cascade-image-generation-with-output.rst | 551 --- ...cade-image-generation-with-output_29_2.jpg | 3 - ...cade-image-generation-with-output_29_2.png | 3 - ...scade-image-generation-with-output_8_2.jpg | 3 - ...scade-image-generation-with-output_8_2.png | 3 - ...table-diffusion-ip-adapter-with-output.rst | 1155 ----- ...-diffusion-ip-adapter-with-output_22_1.png | 3 - ...-diffusion-ip-adapter-with-output_25_0.png | 3 - ...-diffusion-ip-adapter-with-output_28_0.png | 3 - .../stable-diffusion-keras-cv-with-output.rst | 1339 ----- ...le-diffusion-keras-cv-with-output_23_1.png | 3 - ...le-diffusion-keras-cv-with-output_38_0.png | 3 - ...le-diffusion-text-to-image-with-output.rst | 645 --- ...ffusion-text-to-image-with-output_17_1.jpg | 3 - ...ffusion-text-to-image-with-output_17_1.png | 3 - ...ffusion-text-to-image-with-output_27_1.jpg | 3 - ...ffusion-text-to-image-with-output_27_1.png | 3 - ...ffusion-text-to-image-with-output_31_1.jpg | 3 - ...ffusion-text-to-image-with-output_31_1.png | 3 - ...fusion-torchdynamo-backend-with-output.rst | 276 -- 
...n-torchdynamo-backend-with-output_14_1.jpg | 3 - ...n-torchdynamo-backend-with-output_14_1.png | 3 - ...diffusion-v2-infinite-zoom-with-output.rst | 483 -- ...-diffusion-v2-optimum-demo-with-output.rst | 193 - ...usion-v2-optimum-demo-with-output_13_1.jpg | 3 - ...usion-v2-optimum-demo-with-output_13_1.png | 3 - ...sion-v2-text-to-image-demo-with-output.rst | 371 -- ...v2-text-to-image-demo-with-output_24_0.jpg | 3 - ...v2-text-to-image-demo-with-output_24_0.png | 3 - ...diffusion-v2-text-to-image-with-output.rst | 1342 ----- ...able-diffusion-v3-torch-fx-with-output.rst | 561 --- .../stable-diffusion-v3-with-output.rst | 646 --- .../stable-diffusion-v3-with-output_16_1.jpg | 3 - .../stable-diffusion-v3-with-output_16_1.png | 3 - .../stable-diffusion-v3-with-output_30_1.png | 3 - .../stable-diffusion-xl-with-output.rst | 404 -- .../stable-diffusion-xl-with-output_12_1.jpg | 3 - .../stable-diffusion-xl-with-output_12_1.png | 3 - .../stable-diffusion-xl-with-output_20_1.jpg | 3 - .../stable-diffusion-xl-with-output_20_1.png | 3 - docs/notebooks/stable-fast-3d-with-output.rst | 313 -- .../stable-video-diffusion-with-output.rst | 1408 ------ docs/notebooks/style-transfer-with-output.rst | 476 -- ...-line-level-text-detection-with-output.rst | 562 --- ...-level-text-detection-with-output_13_1.jpg | 3 - ...-level-text-detection-with-output_13_1.png | 3 - ...-level-text-detection-with-output_24_1.jpg | 3 - ...-level-text-detection-with-output_24_1.png | 3 - ...e-level-text-detection-with-output_3_0.jpg | 3 - ...e-level-text-detection-with-output_3_0.png | 3 - ...e-level-text-detection-with-output_6_6.jpg | 3 - ...e-level-text-detection-with-output_6_6.png | 3 - .../table-question-answering-with-output.rst | 363 -- ...fication-nncf-quantization-with-output.rst | 578 --- ...ion-nncf-quantization-with-output_10_1.png | 3 - ...ion-nncf-quantization-with-output_27_1.png | 3 - ...tion-nncf-quantization-with-output_9_1.png | 3 - ...classification-to-openvino-with-output.rst | 313 -- ...ification-to-openvino-with-output_19_0.png | 3 - docs/notebooks/tensorflow-hub-with-output.rst | 491 -- .../tensorflow-hub-with-output_11_0.png | 3 - .../tensorflow-hub-with-output_26_0.png | 3 - .../tensorflow-hub-with-output_43_0.png | 3 - ...e-segmentation-to-openvino-with-output.rst | 715 --- ...mentation-to-openvino-with-output_25_1.png | 3 - ...mentation-to-openvino-with-output_39_0.png | 3 - ...ject-detection-to-openvino-with-output.rst | 771 --- ...detection-to-openvino-with-output_25_1.png | 3 - ...detection-to-openvino-with-output_38_0.png | 3 - ...uantization-aware-training-with-output.rst | 531 -- ...ization-aware-training-with-output_6_1.png | 3 - .../text-to-image-genai-with-output.rst | 320 -- .../text-to-image-genai-with-output_13_0.jpg | 3 - .../text-to-image-genai-with-output_13_0.png | 3 - .../text-to-image-genai-with-output_9_0.jpg | 3 - .../text-to-image-genai-with-output_9_0.png | 3 - ...tflite-selfie-segmentation-with-output.rst | 621 --- ...e-selfie-segmentation-with-output_25_0.png | 3 - ...e-selfie-segmentation-with-output_33_0.png | 3 - .../tflite-to-openvino-with-output.rst | 239 - .../tflite-to-openvino-with-output_16_1.jpg | 3 - .../tflite-to-openvino-with-output_16_1.png | 3 - .../tiny-sd-image-generation-with-output.rst | 1136 ----- ...y-sd-image-generation-with-output_35_1.jpg | 3 - ...y-sd-image-generation-with-output_35_1.png | 3 - ...y-sd-image-generation-with-output_39_1.jpg | 3 - ...y-sd-image-generation-with-output_39_1.png | 3 - ...y-sd-image-generation-with-output_41_1.jpg | 3 - 
...y-sd-image-generation-with-output_41_1.png | 3 - ...-detection-and-recognition-with-output.rst | 512 -- ...ction-and-recognition-with-output_14_0.png | 3 - ...ction-and-recognition-with-output_21_0.png | 3 - ...ction-and-recognition-with-output_27_0.png | 3 - .../vision-background-removal-with-output.rst | 436 -- ...on-background-removal-with-output_22_0.png | 3 - ...on-background-removal-with-output_24_0.png | 3 - .../vision-monodepth-with-output.rst | 537 -- .../vision-monodepth-with-output_18_0.png | 3 - docs/notebooks/wav2lip-with-output.rst | 265 - .../whisper-asr-genai-with-output.rst | 997 ---- ...isper-subtitles-generation-with-output.rst | 864 ---- .../yolov10-optimization-with-output.rst | 1115 ----- .../yolov10-optimization-with-output_13_1.jpg | 3 - .../yolov10-optimization-with-output_13_1.png | 3 - .../yolov10-optimization-with-output_19_0.jpg | 3 - .../yolov10-optimization-with-output_19_0.png | 3 - .../yolov10-optimization-with-output_34_0.jpg | 3 - .../yolov10-optimization-with-output_34_0.png | 3 - .../yolov10-optimization-with-output_38_0.jpg | 3 - .../yolov10-optimization-with-output_38_0.png | 3 - .../yolov10-optimization-with-output_50_0.png | 3 - ...ov11-instance-segmentation-with-output.rst | 1009 ---- ...instance-segmentation-with-output_10_1.jpg | 3 - ...instance-segmentation-with-output_10_1.png | 3 - ...instance-segmentation-with-output_19_1.jpg | 3 - ...instance-segmentation-with-output_19_1.png | 3 - ...instance-segmentation-with-output_34_1.jpg | 3 - ...instance-segmentation-with-output_34_1.png | 3 - ...instance-segmentation-with-output_46_0.png | 3 - ...yolov11-keypoint-detection-with-output.rst | 1126 ----- ...11-keypoint-detection-with-output_16_1.jpg | 3 - ...11-keypoint-detection-with-output_16_1.png | 3 - ...11-keypoint-detection-with-output_30_1.jpg | 3 - ...11-keypoint-detection-with-output_30_1.png | 3 - ...11-keypoint-detection-with-output_43_0.png | 3 - ...v11-keypoint-detection-with-output_9_3.jpg | 3 - ...v11-keypoint-detection-with-output_9_3.png | 3 - .../yolov11-object-detection-with-output.rst | 966 ---- ...ov11-object-detection-with-output_10_1.jpg | 3 - ...ov11-object-detection-with-output_10_1.png | 3 - ...ov11-object-detection-with-output_17_1.jpg | 3 - ...ov11-object-detection-with-output_17_1.png | 3 - ...ov11-object-detection-with-output_30_1.jpg | 3 - ...ov11-object-detection-with-output_30_1.png | 3 - ...ov11-object-detection-with-output_43_0.png | 3 - ...tion-with-accuracy-control-with-output.rst | 701 --- ...lov8-instance-segmentation-with-output.rst | 1364 ----- ...instance-segmentation-with-output_18_1.png | 3 - ...instance-segmentation-with-output_46_1.png | 3 - ...instance-segmentation-with-output_62_0.png | 3 - ...-instance-segmentation-with-output_9_3.png | 3 - .../yolov8-keypoint-detection-with-output.rst | 1318 ----- ...v8-keypoint-detection-with-output_16_1.png | 3 - ...v8-keypoint-detection-with-output_44_1.png | 3 - ...v8-keypoint-detection-with-output_60_0.png | 3 - ...ov8-keypoint-detection-with-output_9_3.png | 3 - docs/notebooks/yolov8-obb-with-output.rst | 730 --- .../yolov8-obb-with-output_10_1.png | 3 - .../yolov8-obb-with-output_20_1.png | 3 - .../yolov8-object-detection-with-output.rst | 1902 ------- ...lov8-object-detection-with-output_16_1.png | 3 - ...lov8-object-detection-with-output_43_1.png | 3 - ...lov8-object-detection-with-output_70_0.png | 3 - ...lov8-object-detection-with-output_76_0.png | 3 - ...olov8-object-detection-with-output_9_3.png | 3 - .../yolov9-optimization-with-output.rst | 952 ---- 
.../yolov9-optimization-with-output_16_0.jpg | 3 - .../yolov9-optimization-with-output_16_0.png | 3 - .../yolov9-optimization-with-output_27_0.jpg | 3 - .../yolov9-optimization-with-output_27_0.png | 3 - .../zeroscope-text2video-with-output.rst | 888 ---- .../notebooks/vision-background-removal.nnb | 2 +- .../vision_background_removal.js | 2 +- 634 files changed, 383 insertions(+), 136019 deletions(-) create mode 100644 docs/articles_en/get-started/learn-openvino/interactive-tutorials-python/run-notebooks.rst delete mode 100644 docs/nbdoc/README.md delete mode 100644 docs/nbdoc/__init__.py delete mode 100644 docs/nbdoc/consts.py delete mode 100644 docs/nbdoc/nbdoc.py delete mode 100644 docs/nbdoc/requirements.txt delete mode 100644 docs/nbdoc/utils.py delete mode 100644 docs/notebooks/3D-pose-estimation-with-output.rst delete mode 100644 docs/notebooks/3D-segmentation-point-clouds-with-output.rst delete mode 100644 docs/notebooks/3D-segmentation-point-clouds-with-output_files/3D-segmentation-point-clouds-with-output_11_2.png delete mode 100644 docs/notebooks/3D-segmentation-point-clouds-with-output_files/3D-segmentation-point-clouds-with-output_16_1.png delete mode 100644 docs/notebooks/action-recognition-webcam-with-output.rst delete mode 100644 docs/notebooks/action-recognition-webcam-with-output_files/action-recognition-webcam-with-output_22_0.png delete mode 100644 docs/notebooks/all_notebooks_paths.txt delete mode 100644 docs/notebooks/animate-anyone-with-output.rst delete mode 100644 docs/notebooks/async-api-with-output.rst delete mode 100644 docs/notebooks/async-api-with-output_files/async-api-with-output_17_0.png delete mode 100644 docs/notebooks/async-api-with-output_files/async-api-with-output_21_0.png delete mode 100644 docs/notebooks/async-api-with-output_files/async-api-with-output_23_0.png delete mode 100644 docs/notebooks/async-api-with-output_files/async-api-with-output_29_0.png delete mode 100644 docs/notebooks/auto-device-with-output.rst delete mode 100644 docs/notebooks/auto-device-with-output_files/auto-device-with-output_14_1.jpg delete mode 100644 docs/notebooks/auto-device-with-output_files/auto-device-with-output_14_1.png delete mode 100644 docs/notebooks/auto-device-with-output_files/auto-device-with-output_27_0.png delete mode 100644 docs/notebooks/auto-device-with-output_files/auto-device-with-output_28_0.png delete mode 100644 docs/notebooks/bark-text-to-audio-with-output.rst delete mode 100644 docs/notebooks/blip-visual-language-processing-with-output.rst delete mode 100644 docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_25_0.png delete mode 100644 docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_27_0.png delete mode 100644 docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_47_0.png delete mode 100644 docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_49_0.png delete mode 100644 docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_7_0.png delete mode 100644 docs/notebooks/catvton-with-output.rst delete mode 100644 docs/notebooks/clip-language-saliency-map-with-output.rst delete mode 100644 docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_15_0.png delete mode 100644 
docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_17_0.png delete mode 100644 docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_19_1.png delete mode 100644 docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_29_1.png delete mode 100644 docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_35_1.png delete mode 100644 docs/notebooks/clip-zero-shot-classification-with-output.rst delete mode 100644 docs/notebooks/clip-zero-shot-classification-with-output_files/clip-zero-shot-classification-with-output_13_0.png delete mode 100644 docs/notebooks/clip-zero-shot-classification-with-output_files/clip-zero-shot-classification-with-output_26_0.png delete mode 100644 docs/notebooks/clip-zero-shot-classification-with-output_files/clip-zero-shot-classification-with-output_6_0.png delete mode 100644 docs/notebooks/controlnet-stable-diffusion-with-output.rst delete mode 100644 docs/notebooks/controlnet-stable-diffusion-with-output_files/controlnet-stable-diffusion-with-output_17_0.png delete mode 100644 docs/notebooks/controlnet-stable-diffusion-with-output_files/controlnet-stable-diffusion-with-output_34_1.jpg delete mode 100644 docs/notebooks/controlnet-stable-diffusion-with-output_files/controlnet-stable-diffusion-with-output_34_1.png delete mode 100644 docs/notebooks/controlnet-stable-diffusion-with-output_files/controlnet-stable-diffusion-with-output_50_0.png delete mode 100644 docs/notebooks/controlnet-stable-diffusion-with-output_files/controlnet-stable-diffusion-with-output_8_0.png delete mode 100644 docs/notebooks/convert-to-openvino-with-output.rst delete mode 100644 docs/notebooks/convnext-classification-with-output.rst delete mode 100644 docs/notebooks/ct-segmentation-quantize-nncf-with-output.rst delete mode 100644 docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_14_1.png delete mode 100644 docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_37_1.png delete mode 100644 docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_42_0.jpg delete mode 100644 docs/notebooks/ddcolor-image-colorization-with-output.rst delete mode 100644 docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_10_0.jpg delete mode 100644 docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_10_0.png delete mode 100644 docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_17_0.jpg delete mode 100644 docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_17_0.png delete mode 100644 docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_26_0.jpg delete mode 100644 docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_26_0.png delete mode 100644 docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_9_0.jpg delete mode 100644 docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_9_0.png delete mode 100644 docs/notebooks/deepseek-r1-with-output.rst delete mode 100644 docs/notebooks/depth-anything-v2-with-output.rst delete mode 100644 
docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_15_1.png delete mode 100644 docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_25_1.png delete mode 100644 docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_44_0.png delete mode 100644 docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_9_1.jpg delete mode 100644 docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_9_1.png delete mode 100644 docs/notebooks/depth-anything-with-output.rst delete mode 100644 docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_11_1.jpg delete mode 100644 docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_11_1.png delete mode 100644 docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_18_0.png delete mode 100644 docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_27_0.png delete mode 100644 docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_46_0.png delete mode 100644 docs/notebooks/detectron2-to-openvino-with-output.rst delete mode 100644 docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.jpg delete mode 100644 docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.png delete mode 100644 docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.jpg delete mode 100644 docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.png delete mode 100644 docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_8_0.jpg delete mode 100644 docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_8_0.png delete mode 100644 docs/notebooks/distil-whisper-asr-with-output.rst delete mode 100644 docs/notebooks/dynamicrafter-animating-images-with-output.rst delete mode 100644 docs/notebooks/efficient-sam-with-output.rst delete mode 100644 docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_12_0.jpg delete mode 100644 docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_12_0.png delete mode 100644 docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_17_0.png delete mode 100644 docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_17_1.png delete mode 100644 docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_25_0.png delete mode 100644 docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_25_1.png delete mode 100644 docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_36_0.png delete mode 100644 docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_36_1.png delete mode 100644 docs/notebooks/encodec-audio-compression-with-output.rst delete mode 100644 docs/notebooks/encodec-audio-compression-with-output_files/encodec-audio-compression-with-output_19_1.png delete mode 100644 docs/notebooks/encodec-audio-compression-with-output_files/encodec-audio-compression-with-output_38_1.png delete mode 100644 docs/notebooks/encodec-audio-compression-with-output_files/encodec-audio-compression-with-output_6_2.png delete mode 100644 docs/notebooks/explainable-ai-1-basic-with-output.rst delete mode 100644 docs/notebooks/explainable-ai-1-basic-with-output_files/explainable-ai-1-basic-with-output_11_2.png 
delete mode 100644 docs/notebooks/explainable-ai-1-basic-with-output_files/explainable-ai-1-basic-with-output_19_0.png delete mode 100644 docs/notebooks/explainable-ai-2-deep-dive-with-output.rst delete mode 100644 docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_11_2.png delete mode 100644 docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_22_1.png delete mode 100644 docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_25_1.png delete mode 100644 docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_34_1.png delete mode 100644 docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_35_0.png delete mode 100644 docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_36_0.png delete mode 100644 docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_49_0.png delete mode 100644 docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_52_0.png delete mode 100644 docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_63_1.png delete mode 100644 docs/notebooks/explainable-ai-3-map-interpretation-with-output.rst delete mode 100644 docs/notebooks/explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_50_0.png delete mode 100644 docs/notebooks/explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_53_0.png delete mode 100644 docs/notebooks/explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_56_0.png delete mode 100644 docs/notebooks/explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_59_0.png delete mode 100644 docs/notebooks/fast-segment-anything-with-output.rst delete mode 100644 docs/notebooks/fast-segment-anything-with-output_files/fast-segment-anything-with-output_21_0.jpg delete mode 100644 docs/notebooks/fast-segment-anything-with-output_files/fast-segment-anything-with-output_21_0.png delete mode 100644 docs/notebooks/fast-segment-anything-with-output_files/fast-segment-anything-with-output_9_0.jpg delete mode 100644 docs/notebooks/fast-segment-anything-with-output_files/fast-segment-anything-with-output_9_0.png delete mode 100644 docs/notebooks/film-slowmo-with-output.rst delete mode 100644 docs/notebooks/film-slowmo-with-output_files/film-slowmo-with-output_14_0.png delete mode 100644 docs/notebooks/film-slowmo-with-output_files/film-slowmo-with-output_29_0.png delete mode 100644 docs/notebooks/film-slowmo-with-output_files/film-slowmo-with-output_7_0.png delete mode 100644 docs/notebooks/florence2-with-output.rst delete mode 100644 docs/notebooks/florence2-with-output_files/florence2-with-output_15_0.jpg delete mode 100644 docs/notebooks/florence2-with-output_files/florence2-with-output_15_0.png delete mode 100644 docs/notebooks/florence2-with-output_files/florence2-with-output_18_0.png delete mode 100644 docs/notebooks/flux.1-image-generation-with-output.rst delete mode 100644 docs/notebooks/flux.1-image-generation-with-output_files/flux.1-image-generation-with-output_16_1.jpg delete mode 100644 docs/notebooks/flux.1-image-generation-with-output_files/flux.1-image-generation-with-output_16_1.png 
delete mode 100644 docs/notebooks/freevc-voice-conversion-with-output.rst delete mode 100644 docs/notebooks/glm-edge-v-with-output.rst delete mode 100644 docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_12_1.jpg delete mode 100644 docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_12_1.png delete mode 100644 docs/notebooks/gpu-device-with-output.rst delete mode 100644 docs/notebooks/grammar-correction-with-output.rst delete mode 100644 docs/notebooks/grounded-segment-anything-with-output.rst delete mode 100644 docs/notebooks/grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_30_0.jpg delete mode 100644 docs/notebooks/grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_30_0.png delete mode 100644 docs/notebooks/grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_46_0.jpg delete mode 100644 docs/notebooks/grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_46_0.png delete mode 100644 docs/notebooks/handwritten-ocr-with-output.rst delete mode 100644 docs/notebooks/handwritten-ocr-with-output_files/handwritten-ocr-with-output_22_0.png delete mode 100644 docs/notebooks/handwritten-ocr-with-output_files/handwritten-ocr-with-output_32_1.png delete mode 100644 docs/notebooks/hello-detection-with-output.rst delete mode 100644 docs/notebooks/hello-detection-with-output_files/hello-detection-with-output_11_1.png delete mode 100644 docs/notebooks/hello-detection-with-output_files/hello-detection-with-output_16_0.png delete mode 100644 docs/notebooks/hello-npu-with-output.rst delete mode 100644 docs/notebooks/hello-segmentation-with-output.rst delete mode 100644 docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_11_2.png delete mode 100644 docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_13_1.png delete mode 100644 docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_17_0.png delete mode 100644 docs/notebooks/hello-world-with-output.rst delete mode 100644 docs/notebooks/hello-world-with-output_files/hello-world-with-output_11_1.png delete mode 100644 docs/notebooks/hugging-face-hub-with-output.rst delete mode 100644 docs/notebooks/hunyuan-dit-image-generation-with-output.rst delete mode 100644 docs/notebooks/hunyuan-dit-image-generation-with-output_files/hunyuan-dit-image-generation-with-output_31_0.jpg delete mode 100644 docs/notebooks/hunyuan-dit-image-generation-with-output_files/hunyuan-dit-image-generation-with-output_31_0.png delete mode 100644 docs/notebooks/image-classification-quantization-with-output.rst delete mode 100644 docs/notebooks/image-classification-quantization-with-output_files/image-classification-quantization-with-output_31_2.png delete mode 100644 docs/notebooks/image-to-image-genai-with-output.rst delete mode 100644 docs/notebooks/image-to-image-genai-with-output_files/image-to-image-genai-with-output_11_0.jpg delete mode 100644 docs/notebooks/image-to-image-genai-with-output_files/image-to-image-genai-with-output_11_0.png delete mode 100644 docs/notebooks/image-to-image-genai-with-output_files/image-to-image-genai-with-output_13_1.jpg delete mode 100644 docs/notebooks/image-to-image-genai-with-output_files/image-to-image-genai-with-output_13_1.png delete mode 100644 docs/notebooks/image-to-image-genai-with-output_files/image-to-image-genai-with-output_15_0.jpg delete mode 100644 
docs/notebooks/image-to-image-genai-with-output_files/image-to-image-genai-with-output_15_0.png delete mode 100644 docs/notebooks/image-to-image-genai-with-output_files/image-to-image-genai-with-output_16_1.jpg delete mode 100644 docs/notebooks/image-to-image-genai-with-output_files/image-to-image-genai-with-output_16_1.png delete mode 100644 docs/notebooks/inpainting-genai-with-output.rst delete mode 100644 docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_10_0.jpg delete mode 100644 docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_10_0.png delete mode 100644 docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_12_1.jpg delete mode 100644 docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_12_1.png delete mode 100644 docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_14_0.jpg delete mode 100644 docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_14_0.png delete mode 100644 docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_15_1.jpg delete mode 100644 docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_15_1.png delete mode 100644 docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_18_2.jpg delete mode 100644 docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_18_2.png delete mode 100644 docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_21_1.jpg delete mode 100644 docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_21_1.png delete mode 100644 docs/notebooks/instant-id-with-output.rst delete mode 100644 docs/notebooks/instant-id-with-output_files/instant-id-with-output_16_0.jpg delete mode 100644 docs/notebooks/instant-id-with-output_files/instant-id-with-output_16_0.png delete mode 100644 docs/notebooks/instant-id-with-output_files/instant-id-with-output_17_0.jpg delete mode 100644 docs/notebooks/instant-id-with-output_files/instant-id-with-output_17_0.png delete mode 100644 docs/notebooks/instant-id-with-output_files/instant-id-with-output_40_0.jpg delete mode 100644 docs/notebooks/instant-id-with-output_files/instant-id-with-output_40_0.png delete mode 100644 docs/notebooks/instant-id-with-output_files/instant-id-with-output_66_0.png delete mode 100644 docs/notebooks/instruct-pix2pix-image-editing-with-output.rst delete mode 100644 docs/notebooks/instruct-pix2pix-image-editing-with-output_files/instruct-pix2pix-image-editing-with-output_24_0.png delete mode 100644 docs/notebooks/instruct-pix2pix-image-editing-with-output_files/instruct-pix2pix-image-editing-with-output_36_2.png delete mode 100644 docs/notebooks/internvl2-with-output.rst delete mode 100644 docs/notebooks/internvl2-with-output_files/internvl2-with-output_14_0.jpg delete mode 100644 docs/notebooks/internvl2-with-output_files/internvl2-with-output_14_0.png delete mode 100644 docs/notebooks/janus-multimodal-generation-with-output.rst delete mode 100644 docs/notebooks/janus-multimodal-generation-with-output_files/janus-multimodal-generation-with-output_14_1.jpg delete mode 100644 docs/notebooks/janus-multimodal-generation-with-output_files/janus-multimodal-generation-with-output_14_1.png delete mode 100644 docs/notebooks/janus-multimodal-generation-with-output_files/janus-multimodal-generation-with-output_18_0.jpg delete mode 100644 
docs/notebooks/janus-multimodal-generation-with-output_files/janus-multimodal-generation-with-output_18_0.png delete mode 100644 docs/notebooks/jax-classification-to-openvino-with-output.rst delete mode 100644 docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.jpg delete mode 100644 docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.png delete mode 100644 docs/notebooks/jina-clip-with-output.rst delete mode 100644 docs/notebooks/jina-clip-with-output_files/jina-clip-with-output_11_0.png delete mode 100644 docs/notebooks/jina-clip-with-output_files/jina-clip-with-output_21_0.png delete mode 100644 docs/notebooks/jina-clip-with-output_files/jina-clip-with-output_37_0.png delete mode 100644 docs/notebooks/knowledge-graphs-conve-with-output.rst delete mode 100644 docs/notebooks/kosmos2-multimodal-large-language-model-with-output.rst delete mode 100644 docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.jpg delete mode 100644 docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.png delete mode 100644 docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_48_1.png delete mode 100644 docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.jpg delete mode 100644 docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.png delete mode 100644 docs/notebooks/language-quantize-bert-with-output.rst delete mode 100644 docs/notebooks/latent-consistency-models-image-generation-with-output.rst delete mode 100644 docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_13_0.jpg delete mode 100644 docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_13_0.png delete mode 100644 docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_27_1.jpg delete mode 100644 docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_27_1.png delete mode 100644 docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_37_0.jpg delete mode 100644 docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_37_0.png delete mode 100644 docs/notebooks/lcm-lora-controlnet-with-output.rst delete mode 100644 docs/notebooks/lcm-lora-controlnet-with-output_files/lcm-lora-controlnet-with-output_10_1.png delete mode 100644 docs/notebooks/lcm-lora-controlnet-with-output_files/lcm-lora-controlnet-with-output_27_2.jpg delete mode 100644 docs/notebooks/lcm-lora-controlnet-with-output_files/lcm-lora-controlnet-with-output_27_2.png delete mode 100644 docs/notebooks/lcm-lora-controlnet-with-output_files/lcm-lora-controlnet-with-output_28_0.png delete mode 100644 docs/notebooks/lcm-lora-controlnet-with-output_files/lcm-lora-controlnet-with-output_42_1.png delete mode 100644 
docs/notebooks/llava-multimodal-chatbot-genai-with-output.rst delete mode 100644 docs/notebooks/llava-multimodal-chatbot-genai-with-output_files/llava-multimodal-chatbot-genai-with-output_21_0.jpg delete mode 100644 docs/notebooks/llava-multimodal-chatbot-genai-with-output_files/llava-multimodal-chatbot-genai-with-output_21_0.png delete mode 100644 docs/notebooks/llava-multimodal-chatbot-optimum-with-output.rst delete mode 100644 docs/notebooks/llava-multimodal-chatbot-optimum-with-output_files/llava-multimodal-chatbot-optimum-with-output_20_0.jpg delete mode 100644 docs/notebooks/llava-multimodal-chatbot-optimum-with-output_files/llava-multimodal-chatbot-optimum-with-output_20_0.png delete mode 100644 docs/notebooks/llava-next-multimodal-chatbot-with-output.rst delete mode 100644 docs/notebooks/llava-next-multimodal-chatbot-with-output_files/llava-next-multimodal-chatbot-with-output_18_0.jpg delete mode 100644 docs/notebooks/llava-next-multimodal-chatbot-with-output_files/llava-next-multimodal-chatbot-with-output_18_0.png delete mode 100644 docs/notebooks/llm-agent-functioncall-qwen-with-output.rst delete mode 100644 docs/notebooks/llm-agent-rag-llamaindex-with-output.rst delete mode 100644 docs/notebooks/llm-agent-react-langchain-with-output.rst delete mode 100644 docs/notebooks/llm-agent-react-with-output.rst delete mode 100644 docs/notebooks/llm-chatbot-generate-api-with-output.rst delete mode 100644 docs/notebooks/llm-chatbot-with-output.rst delete mode 100644 docs/notebooks/llm-question-answering-with-output.rst delete mode 100644 docs/notebooks/llm-rag-langchain-genai-with-output.rst delete mode 100644 docs/notebooks/llm-rag-langchain-with-output.rst delete mode 100644 docs/notebooks/llm-rag-llamaindex-with-output.rst delete mode 100644 docs/notebooks/localai-with-output.rst delete mode 100644 docs/notebooks/ltx-video-with-output.rst delete mode 100644 docs/notebooks/magika-content-type-recognition-with-output.rst delete mode 100644 docs/notebooks/meter-reader-with-output.rst delete mode 100644 docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_16_1.png delete mode 100644 docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_18_1.png delete mode 100644 docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_20_1.png delete mode 100644 docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_22_1.png delete mode 100644 docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_24_1.png delete mode 100644 docs/notebooks/minicpm-v-multimodal-chatbot-with-output.rst delete mode 100644 docs/notebooks/minicpm-v-multimodal-chatbot-with-output_files/minicpm-v-multimodal-chatbot-with-output_12_1.jpg delete mode 100644 docs/notebooks/minicpm-v-multimodal-chatbot-with-output_files/minicpm-v-multimodal-chatbot-with-output_12_1.png delete mode 100644 docs/notebooks/mllama-3.2-with-output.rst delete mode 100644 docs/notebooks/mllama-3.2-with-output_files/mllama-3.2-with-output_19_1.jpg delete mode 100644 docs/notebooks/mllama-3.2-with-output_files/mllama-3.2-with-output_19_1.png delete mode 100644 docs/notebooks/mms-massively-multilingual-speech-with-output.rst delete mode 100644 docs/notebooks/mobileclip-video-search-with-output.rst delete mode 100644 docs/notebooks/mobileclip-video-search-with-output_files/mobileclip-video-search-with-output_12_0.png delete mode 100644 docs/notebooks/mobileclip-video-search-with-output_files/mobileclip-video-search-with-output_17_1.png delete mode 100644 
docs/notebooks/mobileclip-video-search-with-output_files/mobileclip-video-search-with-output_28_1.png delete mode 100644 docs/notebooks/model-server-with-output.rst delete mode 100644 docs/notebooks/model-server-with-output_files/model-server-with-output_23_2.png delete mode 100644 docs/notebooks/model-server-with-output_files/model-server-with-output_28_1.png delete mode 100644 docs/notebooks/modelscope-to-openvino-with-output.rst delete mode 100644 docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.jpg delete mode 100644 docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.png delete mode 100644 docs/notebooks/multilora-image-generation-with-output.rst delete mode 100644 docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_15_0.jpg delete mode 100644 docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_15_0.png delete mode 100644 docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_18_0.jpg delete mode 100644 docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_18_0.png delete mode 100644 docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_21_0.jpg delete mode 100644 docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_21_0.png delete mode 100644 docs/notebooks/multimodal-rag-llamaindex-with-output.rst delete mode 100644 docs/notebooks/multimodal-rag-llamaindex-with-output_files/multimodal-rag-llamaindex-with-output_29_3.png delete mode 100644 docs/notebooks/music-generation-with-output.rst delete mode 100644 docs/notebooks/named-entity-recognition-with-output.rst delete mode 100644 docs/notebooks/nano-llava-multimodal-chatbot-with-output.rst delete mode 100644 docs/notebooks/nano-llava-multimodal-chatbot-with-output_files/nano-llava-multimodal-chatbot-with-output_22_0.jpg delete mode 100644 docs/notebooks/nano-llava-multimodal-chatbot-with-output_files/nano-llava-multimodal-chatbot-with-output_22_0.png delete mode 100644 docs/notebooks/notebooks_with_binder_buttons.txt delete mode 100644 docs/notebooks/notebooks_with_colab_buttons.txt delete mode 100644 docs/notebooks/nuextract-structure-extraction-with-output.rst delete mode 100644 docs/notebooks/object-detection-with-output.rst delete mode 100644 docs/notebooks/object-detection-with-output_files/object-detection-with-output_13_0.png delete mode 100644 docs/notebooks/omnigen-with-output.rst delete mode 100644 docs/notebooks/omnigen-with-output_files/omnigen-with-output_14_1.jpg delete mode 100644 docs/notebooks/omnigen-with-output_files/omnigen-with-output_14_1.png delete mode 100644 docs/notebooks/omnigen-with-output_files/omnigen-with-output_16_1.jpg delete mode 100644 docs/notebooks/omnigen-with-output_files/omnigen-with-output_16_1.png delete mode 100644 docs/notebooks/omniparser-with-output.rst delete mode 100644 docs/notebooks/omniparser-with-output_files/omniparser-with-output_32_0.jpg delete mode 100644 docs/notebooks/omniparser-with-output_files/omniparser-with-output_32_0.png delete mode 100644 docs/notebooks/oneformer-segmentation-with-output.rst delete mode 100644 docs/notebooks/oneformer-segmentation-with-output_files/oneformer-segmentation-with-output_23_1.jpg delete mode 100644 
docs/notebooks/oneformer-segmentation-with-output_files/oneformer-segmentation-with-output_23_1.png delete mode 100644 docs/notebooks/oneformer-segmentation-with-output_files/oneformer-segmentation-with-output_27_0.jpg delete mode 100644 docs/notebooks/oneformer-segmentation-with-output_files/oneformer-segmentation-with-output_27_0.png delete mode 100644 docs/notebooks/oneformer-segmentation-with-output_files/oneformer-segmentation-with-output_39_1.jpg delete mode 100644 docs/notebooks/oneformer-segmentation-with-output_files/oneformer-segmentation-with-output_39_1.png delete mode 100644 docs/notebooks/oneformer-segmentation-with-output_files/oneformer-segmentation-with-output_39_3.jpg delete mode 100644 docs/notebooks/oneformer-segmentation-with-output_files/oneformer-segmentation-with-output_39_3.png delete mode 100644 docs/notebooks/openvino-api-with-output.rst delete mode 100644 docs/notebooks/openvino-tokenizers-with-output.rst delete mode 100644 docs/notebooks/openvoice-with-output.rst delete mode 100644 docs/notebooks/optical-character-recognition-with-output.rst delete mode 100644 docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_13_0.png delete mode 100644 docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_23_0.png delete mode 100644 docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_0.jpg delete mode 100644 docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_0.png delete mode 100644 docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_10.jpg delete mode 100644 docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_10.png delete mode 100644 docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_2.jpg delete mode 100644 docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_2.png delete mode 100644 docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_4.jpg delete mode 100644 docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_4.png delete mode 100644 docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_6.jpg delete mode 100644 docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_6.png delete mode 100644 docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_8.jpg delete mode 100644 docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_8.png delete mode 100644 docs/notebooks/optimize-preprocessing-with-output.rst delete mode 100644 docs/notebooks/optimize-preprocessing-with-output_files/optimize-preprocessing-with-output_14_1.png delete mode 100644 docs/notebooks/outetts-text-to-speech-with-output.rst delete mode 100644 docs/notebooks/paddle-ocr-webcam-with-output.rst delete mode 100644 docs/notebooks/paddle-to-openvino-classification-with-output.rst delete mode 100644 docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_14_3.png delete mode 100644 
docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_22_1.png delete mode 100644 docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_26_1.png delete mode 100644 docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_29_1.png delete mode 100644 docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_7_1.png delete mode 100644 docs/notebooks/parler-tts-text-to-speech-with-output.rst delete mode 100644 docs/notebooks/person-counting-with-output.rst delete mode 100644 docs/notebooks/person-counting-with-output_files/person-counting-with-output_14_0.png delete mode 100644 docs/notebooks/person-tracking-with-output.rst delete mode 100644 docs/notebooks/phi-3-vision-with-output.rst delete mode 100644 docs/notebooks/phi-3-vision-with-output_files/phi-3-vision-with-output_14_1.jpg delete mode 100644 docs/notebooks/phi-3-vision-with-output_files/phi-3-vision-with-output_14_1.png delete mode 100644 docs/notebooks/photo-maker-with-output.rst delete mode 100644 docs/notebooks/photo-maker-with-output_files/photo-maker-with-output_34_0.png delete mode 100644 docs/notebooks/pix2struct-docvqa-with-output.rst delete mode 100644 docs/notebooks/pix2struct-docvqa-with-output_files/pix2struct-docvqa-with-output_11_0.jpg delete mode 100644 docs/notebooks/pix2struct-docvqa-with-output_files/pix2struct-docvqa-with-output_11_0.png delete mode 100644 docs/notebooks/pixart-with-output.rst delete mode 100644 docs/notebooks/pixart-with-output_files/pixart-with-output_27_0.jpg delete mode 100644 docs/notebooks/pixart-with-output_files/pixart-with-output_27_0.png delete mode 100644 docs/notebooks/pixart-with-output_files/pixart-with-output_40_2.png delete mode 100644 docs/notebooks/pixart-with-output_files/pixart-with-output_6_0.jpg delete mode 100644 docs/notebooks/pixart-with-output_files/pixart-with-output_6_0.png delete mode 100644 docs/notebooks/pixtral-with-output.rst delete mode 100644 docs/notebooks/pixtral-with-output_files/pixtral-with-output_12_1.jpg delete mode 100644 docs/notebooks/pixtral-with-output_files/pixtral-with-output_12_1.png delete mode 100644 docs/notebooks/pose-estimation-with-output.rst delete mode 100644 docs/notebooks/pytorch-onnx-to-openvino-with-output.rst delete mode 100644 docs/notebooks/pytorch-onnx-to-openvino-with-output_files/pytorch-onnx-to-openvino-with-output_22_0.png delete mode 100644 docs/notebooks/pytorch-onnx-to-openvino-with-output_files/pytorch-onnx-to-openvino-with-output_27_0.png delete mode 100644 docs/notebooks/pytorch-onnx-to-openvino-with-output_files/pytorch-onnx-to-openvino-with-output_29_0.png delete mode 100644 docs/notebooks/pytorch-post-training-quantization-nncf-with-output.rst delete mode 100644 docs/notebooks/pytorch-quantization-aware-training-with-output.rst delete mode 100644 docs/notebooks/pytorch-quantization-sparsity-aware-training-with-output.rst delete mode 100644 docs/notebooks/pytorch-to-openvino-with-output.rst delete mode 100644 docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_11_0.png delete mode 100644 docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_20_0.png delete mode 100644 docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_31_0.png delete mode 100644 
docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_35_0.png delete mode 100644 docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_39_0.png delete mode 100644 docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_43_0.png delete mode 100644 docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_47_0.png delete mode 100644 docs/notebooks/qrcode-monster-with-output.rst delete mode 100644 docs/notebooks/qrcode-monster-with-output_files/qrcode-monster-with-output_22_1.jpg delete mode 100644 docs/notebooks/qrcode-monster-with-output_files/qrcode-monster-with-output_22_1.png delete mode 100644 docs/notebooks/qrcode-monster-with-output_files/qrcode-monster-with-output_39_0.png delete mode 100644 docs/notebooks/qwen2-audio-with-output.rst delete mode 100644 docs/notebooks/qwen2-vl-with-output.rst delete mode 100644 docs/notebooks/qwen2-vl-with-output_files/qwen2-vl-with-output_15_0.jpg delete mode 100644 docs/notebooks/qwen2-vl-with-output_files/qwen2-vl-with-output_15_0.png delete mode 100644 docs/notebooks/riffusion-text-to-music-with-output.rst delete mode 100644 docs/notebooks/riffusion-text-to-music-with-output_files/riffusion-text-to-music-with-output_14_0.jpg delete mode 100644 docs/notebooks/riffusion-text-to-music-with-output_files/riffusion-text-to-music-with-output_14_0.png delete mode 100644 docs/notebooks/rmbg-background-removal-with-output.rst delete mode 100644 docs/notebooks/rmbg-background-removal-with-output_files/rmbg-background-removal-with-output_14_0.png delete mode 100644 docs/notebooks/rmbg-background-removal-with-output_files/rmbg-background-removal-with-output_8_0.png delete mode 100644 docs/notebooks/s3d-mil-nce-text-to-video-retrieval-with-output.rst delete mode 100644 docs/notebooks/sana-image-generation-with-output.rst delete mode 100644 docs/notebooks/sana-image-generation-with-output_files/sana-image-generation-with-output_15_1.jpg delete mode 100644 docs/notebooks/sana-image-generation-with-output_files/sana-image-generation-with-output_15_1.png delete mode 100644 docs/notebooks/sdxl-turbo-with-output.rst delete mode 100644 docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_11_1.jpg delete mode 100644 docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_11_1.png delete mode 100644 docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_17_1.jpg delete mode 100644 docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_17_1.png delete mode 100644 docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_29_3.jpg delete mode 100644 docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_29_3.png delete mode 100644 docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_30_3.jpg delete mode 100644 docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_30_3.png delete mode 100644 docs/notebooks/segment-anything-2-image-with-output.rst delete mode 100644 docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_12_0.png delete mode 100644 docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_16_0.png delete mode 100644 docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_18_0.png delete mode 100644 docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_33_0.png delete mode 100644 
docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_39_0.png delete mode 100644 docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_47_0.png delete mode 100644 docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_51_0.png delete mode 100644 docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_56_0.png delete mode 100644 docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_60_0.png delete mode 100644 docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_65_0.png delete mode 100644 docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_77_1.jpg delete mode 100644 docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_77_1.png delete mode 100644 docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_92_0.png delete mode 100644 docs/notebooks/segment-anything-2-video-with-output.rst delete mode 100644 docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_40_1.png delete mode 100644 docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_46_0.png delete mode 100644 docs/notebooks/segment-anything-with-output.rst delete mode 100644 docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_21_0.png delete mode 100644 docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_28_0.png delete mode 100644 docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_35_0.png delete mode 100644 docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_39_0.png delete mode 100644 docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_44_0.png delete mode 100644 docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_48_0.png delete mode 100644 docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_53_0.png delete mode 100644 docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_69_1.jpg delete mode 100644 docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_69_1.png delete mode 100644 docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_81_0.png delete mode 100644 docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_83_1.jpg delete mode 100644 docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_83_1.png delete mode 100644 docs/notebooks/siglip-zero-shot-image-classification-with-output.rst delete mode 100644 docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_13_1.png delete mode 100644 docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_24_1.png delete mode 100644 docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_6_1.png delete mode 100644 docs/notebooks/sketch-to-image-pix2pix-turbo-with-output.rst delete mode 100644 docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_15_0.jpg delete mode 100644 
docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_15_0.png delete mode 100644 docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.jpg delete mode 100644 docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.png delete mode 100644 docs/notebooks/softvc-voice-conversion-with-output.rst delete mode 100644 docs/notebooks/sound-generation-audioldm2-with-output.rst delete mode 100644 docs/notebooks/sparsity-optimization-with-output.rst delete mode 100644 docs/notebooks/speculative-sampling-with-output.rst delete mode 100644 docs/notebooks/speech-recognition-quantization-wav2vec2-with-output.rst delete mode 100644 docs/notebooks/speechbrain-emotion-recognition-with-output.rst delete mode 100644 docs/notebooks/stable-audio-with-output.rst delete mode 100644 docs/notebooks/stable-cascade-image-generation-with-output.rst delete mode 100644 docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.jpg delete mode 100644 docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.png delete mode 100644 docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_8_2.jpg delete mode 100644 docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_8_2.png delete mode 100644 docs/notebooks/stable-diffusion-ip-adapter-with-output.rst delete mode 100644 docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_22_1.png delete mode 100644 docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_25_0.png delete mode 100644 docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_28_0.png delete mode 100644 docs/notebooks/stable-diffusion-keras-cv-with-output.rst delete mode 100644 docs/notebooks/stable-diffusion-keras-cv-with-output_files/stable-diffusion-keras-cv-with-output_23_1.png delete mode 100644 docs/notebooks/stable-diffusion-keras-cv-with-output_files/stable-diffusion-keras-cv-with-output_38_0.png delete mode 100644 docs/notebooks/stable-diffusion-text-to-image-with-output.rst delete mode 100644 docs/notebooks/stable-diffusion-text-to-image-with-output_files/stable-diffusion-text-to-image-with-output_17_1.jpg delete mode 100644 docs/notebooks/stable-diffusion-text-to-image-with-output_files/stable-diffusion-text-to-image-with-output_17_1.png delete mode 100644 docs/notebooks/stable-diffusion-text-to-image-with-output_files/stable-diffusion-text-to-image-with-output_27_1.jpg delete mode 100644 docs/notebooks/stable-diffusion-text-to-image-with-output_files/stable-diffusion-text-to-image-with-output_27_1.png delete mode 100644 docs/notebooks/stable-diffusion-text-to-image-with-output_files/stable-diffusion-text-to-image-with-output_31_1.jpg delete mode 100644 docs/notebooks/stable-diffusion-text-to-image-with-output_files/stable-diffusion-text-to-image-with-output_31_1.png delete mode 100644 docs/notebooks/stable-diffusion-torchdynamo-backend-with-output.rst delete mode 100644 docs/notebooks/stable-diffusion-torchdynamo-backend-with-output_files/stable-diffusion-torchdynamo-backend-with-output_14_1.jpg delete mode 100644 
docs/notebooks/stable-diffusion-torchdynamo-backend-with-output_files/stable-diffusion-torchdynamo-backend-with-output_14_1.png delete mode 100644 docs/notebooks/stable-diffusion-v2-infinite-zoom-with-output.rst delete mode 100644 docs/notebooks/stable-diffusion-v2-optimum-demo-with-output.rst delete mode 100644 docs/notebooks/stable-diffusion-v2-optimum-demo-with-output_files/stable-diffusion-v2-optimum-demo-with-output_13_1.jpg delete mode 100644 docs/notebooks/stable-diffusion-v2-optimum-demo-with-output_files/stable-diffusion-v2-optimum-demo-with-output_13_1.png delete mode 100644 docs/notebooks/stable-diffusion-v2-text-to-image-demo-with-output.rst delete mode 100644 docs/notebooks/stable-diffusion-v2-text-to-image-demo-with-output_files/stable-diffusion-v2-text-to-image-demo-with-output_24_0.jpg delete mode 100644 docs/notebooks/stable-diffusion-v2-text-to-image-demo-with-output_files/stable-diffusion-v2-text-to-image-demo-with-output_24_0.png delete mode 100644 docs/notebooks/stable-diffusion-v2-text-to-image-with-output.rst delete mode 100644 docs/notebooks/stable-diffusion-v3-torch-fx-with-output.rst delete mode 100644 docs/notebooks/stable-diffusion-v3-with-output.rst delete mode 100644 docs/notebooks/stable-diffusion-v3-with-output_files/stable-diffusion-v3-with-output_16_1.jpg delete mode 100644 docs/notebooks/stable-diffusion-v3-with-output_files/stable-diffusion-v3-with-output_16_1.png delete mode 100644 docs/notebooks/stable-diffusion-v3-with-output_files/stable-diffusion-v3-with-output_30_1.png delete mode 100644 docs/notebooks/stable-diffusion-xl-with-output.rst delete mode 100644 docs/notebooks/stable-diffusion-xl-with-output_files/stable-diffusion-xl-with-output_12_1.jpg delete mode 100644 docs/notebooks/stable-diffusion-xl-with-output_files/stable-diffusion-xl-with-output_12_1.png delete mode 100644 docs/notebooks/stable-diffusion-xl-with-output_files/stable-diffusion-xl-with-output_20_1.jpg delete mode 100644 docs/notebooks/stable-diffusion-xl-with-output_files/stable-diffusion-xl-with-output_20_1.png delete mode 100644 docs/notebooks/stable-fast-3d-with-output.rst delete mode 100644 docs/notebooks/stable-video-diffusion-with-output.rst delete mode 100644 docs/notebooks/style-transfer-with-output.rst delete mode 100644 docs/notebooks/surya-line-level-text-detection-with-output.rst delete mode 100644 docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_13_1.jpg delete mode 100644 docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_13_1.png delete mode 100644 docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_24_1.jpg delete mode 100644 docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_24_1.png delete mode 100644 docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_3_0.jpg delete mode 100644 docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_3_0.png delete mode 100644 docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_6_6.jpg delete mode 100644 docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_6_6.png delete mode 100644 docs/notebooks/table-question-answering-with-output.rst delete mode 100644 
docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output.rst delete mode 100644 docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_10_1.png delete mode 100644 docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_27_1.png delete mode 100644 docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_9_1.png delete mode 100644 docs/notebooks/tensorflow-classification-to-openvino-with-output.rst delete mode 100644 docs/notebooks/tensorflow-classification-to-openvino-with-output_files/tensorflow-classification-to-openvino-with-output_19_0.png delete mode 100644 docs/notebooks/tensorflow-hub-with-output.rst delete mode 100644 docs/notebooks/tensorflow-hub-with-output_files/tensorflow-hub-with-output_11_0.png delete mode 100644 docs/notebooks/tensorflow-hub-with-output_files/tensorflow-hub-with-output_26_0.png delete mode 100644 docs/notebooks/tensorflow-hub-with-output_files/tensorflow-hub-with-output_43_0.png delete mode 100644 docs/notebooks/tensorflow-instance-segmentation-to-openvino-with-output.rst delete mode 100644 docs/notebooks/tensorflow-instance-segmentation-to-openvino-with-output_files/tensorflow-instance-segmentation-to-openvino-with-output_25_1.png delete mode 100644 docs/notebooks/tensorflow-instance-segmentation-to-openvino-with-output_files/tensorflow-instance-segmentation-to-openvino-with-output_39_0.png delete mode 100644 docs/notebooks/tensorflow-object-detection-to-openvino-with-output.rst delete mode 100644 docs/notebooks/tensorflow-object-detection-to-openvino-with-output_files/tensorflow-object-detection-to-openvino-with-output_25_1.png delete mode 100644 docs/notebooks/tensorflow-object-detection-to-openvino-with-output_files/tensorflow-object-detection-to-openvino-with-output_38_0.png delete mode 100644 docs/notebooks/tensorflow-quantization-aware-training-with-output.rst delete mode 100644 docs/notebooks/tensorflow-quantization-aware-training-with-output_files/tensorflow-quantization-aware-training-with-output_6_1.png delete mode 100644 docs/notebooks/text-to-image-genai-with-output.rst delete mode 100644 docs/notebooks/text-to-image-genai-with-output_files/text-to-image-genai-with-output_13_0.jpg delete mode 100644 docs/notebooks/text-to-image-genai-with-output_files/text-to-image-genai-with-output_13_0.png delete mode 100644 docs/notebooks/text-to-image-genai-with-output_files/text-to-image-genai-with-output_9_0.jpg delete mode 100644 docs/notebooks/text-to-image-genai-with-output_files/text-to-image-genai-with-output_9_0.png delete mode 100644 docs/notebooks/tflite-selfie-segmentation-with-output.rst delete mode 100644 docs/notebooks/tflite-selfie-segmentation-with-output_files/tflite-selfie-segmentation-with-output_25_0.png delete mode 100644 docs/notebooks/tflite-selfie-segmentation-with-output_files/tflite-selfie-segmentation-with-output_33_0.png delete mode 100644 docs/notebooks/tflite-to-openvino-with-output.rst delete mode 100644 docs/notebooks/tflite-to-openvino-with-output_files/tflite-to-openvino-with-output_16_1.jpg delete mode 100644 docs/notebooks/tflite-to-openvino-with-output_files/tflite-to-openvino-with-output_16_1.png delete mode 100644 docs/notebooks/tiny-sd-image-generation-with-output.rst delete mode 100644 
docs/notebooks/tiny-sd-image-generation-with-output_files/tiny-sd-image-generation-with-output_35_1.jpg delete mode 100644 docs/notebooks/tiny-sd-image-generation-with-output_files/tiny-sd-image-generation-with-output_35_1.png delete mode 100644 docs/notebooks/tiny-sd-image-generation-with-output_files/tiny-sd-image-generation-with-output_39_1.jpg delete mode 100644 docs/notebooks/tiny-sd-image-generation-with-output_files/tiny-sd-image-generation-with-output_39_1.png delete mode 100644 docs/notebooks/tiny-sd-image-generation-with-output_files/tiny-sd-image-generation-with-output_41_1.jpg delete mode 100644 docs/notebooks/tiny-sd-image-generation-with-output_files/tiny-sd-image-generation-with-output_41_1.png delete mode 100644 docs/notebooks/vehicle-detection-and-recognition-with-output.rst delete mode 100644 docs/notebooks/vehicle-detection-and-recognition-with-output_files/vehicle-detection-and-recognition-with-output_14_0.png delete mode 100644 docs/notebooks/vehicle-detection-and-recognition-with-output_files/vehicle-detection-and-recognition-with-output_21_0.png delete mode 100644 docs/notebooks/vehicle-detection-and-recognition-with-output_files/vehicle-detection-and-recognition-with-output_27_0.png delete mode 100644 docs/notebooks/vision-background-removal-with-output.rst delete mode 100644 docs/notebooks/vision-background-removal-with-output_files/vision-background-removal-with-output_22_0.png delete mode 100644 docs/notebooks/vision-background-removal-with-output_files/vision-background-removal-with-output_24_0.png delete mode 100644 docs/notebooks/vision-monodepth-with-output.rst delete mode 100644 docs/notebooks/vision-monodepth-with-output_files/vision-monodepth-with-output_18_0.png delete mode 100644 docs/notebooks/wav2lip-with-output.rst delete mode 100644 docs/notebooks/whisper-asr-genai-with-output.rst delete mode 100644 docs/notebooks/whisper-subtitles-generation-with-output.rst delete mode 100644 docs/notebooks/yolov10-optimization-with-output.rst delete mode 100644 docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_13_1.jpg delete mode 100644 docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_13_1.png delete mode 100644 docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_19_0.jpg delete mode 100644 docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_19_0.png delete mode 100644 docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_34_0.jpg delete mode 100644 docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_34_0.png delete mode 100644 docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_38_0.jpg delete mode 100644 docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_38_0.png delete mode 100644 docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_50_0.png delete mode 100644 docs/notebooks/yolov11-instance-segmentation-with-output.rst delete mode 100644 docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_10_1.jpg delete mode 100644 docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_10_1.png delete mode 100644 docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_19_1.jpg delete mode 100644 
docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_19_1.png delete mode 100644 docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_34_1.jpg delete mode 100644 docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_34_1.png delete mode 100644 docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_46_0.png delete mode 100644 docs/notebooks/yolov11-keypoint-detection-with-output.rst delete mode 100644 docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_16_1.jpg delete mode 100644 docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_16_1.png delete mode 100644 docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_30_1.jpg delete mode 100644 docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_30_1.png delete mode 100644 docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_43_0.png delete mode 100644 docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_9_3.jpg delete mode 100644 docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_9_3.png delete mode 100644 docs/notebooks/yolov11-object-detection-with-output.rst delete mode 100644 docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_10_1.jpg delete mode 100644 docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_10_1.png delete mode 100644 docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_17_1.jpg delete mode 100644 docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_17_1.png delete mode 100644 docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_30_1.jpg delete mode 100644 docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_30_1.png delete mode 100644 docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_43_0.png delete mode 100644 docs/notebooks/yolov11-quantization-with-accuracy-control-with-output.rst delete mode 100644 docs/notebooks/yolov8-instance-segmentation-with-output.rst delete mode 100644 docs/notebooks/yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_18_1.png delete mode 100644 docs/notebooks/yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_46_1.png delete mode 100644 docs/notebooks/yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_62_0.png delete mode 100644 docs/notebooks/yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_9_3.png delete mode 100644 docs/notebooks/yolov8-keypoint-detection-with-output.rst delete mode 100644 docs/notebooks/yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_16_1.png delete mode 100644 docs/notebooks/yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_44_1.png delete mode 100644 
docs/notebooks/yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_60_0.png delete mode 100644 docs/notebooks/yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_9_3.png delete mode 100644 docs/notebooks/yolov8-obb-with-output.rst delete mode 100644 docs/notebooks/yolov8-obb-with-output_files/yolov8-obb-with-output_10_1.png delete mode 100644 docs/notebooks/yolov8-obb-with-output_files/yolov8-obb-with-output_20_1.png delete mode 100644 docs/notebooks/yolov8-object-detection-with-output.rst delete mode 100644 docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_16_1.png delete mode 100644 docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_43_1.png delete mode 100644 docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_70_0.png delete mode 100644 docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_76_0.png delete mode 100644 docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_9_3.png delete mode 100644 docs/notebooks/yolov9-optimization-with-output.rst delete mode 100644 docs/notebooks/yolov9-optimization-with-output_files/yolov9-optimization-with-output_16_0.jpg delete mode 100644 docs/notebooks/yolov9-optimization-with-output_files/yolov9-optimization-with-output_16_0.png delete mode 100644 docs/notebooks/yolov9-optimization-with-output_files/yolov9-optimization-with-output_27_0.jpg delete mode 100644 docs/notebooks/yolov9-optimization-with-output_files/yolov9-optimization-with-output_27_0.png delete mode 100644 docs/notebooks/zeroscope-text2video-with-output.rst diff --git a/README.md b/README.md index ea242b696bcfdc..86b30f0f8ced22 100644 --- a/README.md +++ b/README.md @@ -102,7 +102,7 @@ OpenVINO supports the CPU, GPU, and NPU [devices](https://docs.openvino.ai/2025/ Get started with the OpenVINO GenAI [installation](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-genai.html) and refer to the [detailed guide](https://docs.openvino.ai/2025/openvino-workflow-generative/generative-inference.html) to explore the capabilities of Generative AI using OpenVINO. -Learn how to run LLMs and GenAI with [Samples](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples) in the [OpenVINO™ GenAI repo](https://github.com/openvinotoolkit/openvino.genai). See GenAI in action with Jupyter notebooks: [LLM-powered Chatbot](https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/llm-chatbot/README.md) and [LLM Instruction-following pipeline](https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/llm-question-answering/README.md). +Learn how to run LLMs and GenAI with [Samples](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples) in the [OpenVINO™ GenAI repo](https://github.com/openvinotoolkit/openvino.genai). See GenAI in action with Jupyter notebooks: [LLM-powered Chatbot](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) and [LLM Instruction-following pipeline](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering). 
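A minimal, hedged sketch of the kind of pipeline those GenAI samples and notebooks demonstrate, using the OpenVINO GenAI Python API. The model directory below is a placeholder for an LLM already exported to OpenVINO IR (for example with `optimum-cli export openvino`); it is illustrative only and not part of this patch:

```python
import openvino_genai as ov_genai

# Placeholder folder with an LLM already exported to OpenVINO IR (illustrative, not from this patch),
# e.g. produced by:
#   optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --weight-format int4 TinyLlama-1.1B-Chat-v1.0-int4-ov
model_dir = "TinyLlama-1.1B-Chat-v1.0-int4-ov"

# Build a text-generation pipeline on the chosen device ("CPU", "GPU", or "NPU")
pipe = ov_genai.LLMPipeline(model_dir, "CPU")

# Generate a short completion for a prompt
print(pipe.generate("What is OpenVINO?", max_new_tokens=100))
```

The chatbot and instruction-following notebooks linked above build on pipelines like this one, adding model export, streaming output, and a chat interface.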
## Documentation diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index cdf601602795d7..a39ca10aef6305 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -70,15 +70,6 @@ function(build_docs) list(APPEND commands COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "FINISHED preprocessing OpenVINO Python API") endif() - if(${ENABLE_NOTEBOOKS}) - set(NBDOC_SCRIPT "${DOCS_SOURCE_DIR}/nbdoc/nbdoc.py") - list(PREPEND commands COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "STARTED preprocessing OpenVINO notebooks") - list(PREPEND commands - COMMAND ${Python3_EXECUTABLE} "${NBDOC_SCRIPT}" "${DOCS_SOURCE_DIR}/notebooks" "${SPHINX_SOURCE_DIR}/notebooks" - ) - list(PREPEND commands COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "FINISHED preprocessing OpenVINO notebooks") - endif() - if(${ENABLE_OVMS}) list(APPEND commands COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "STARTED preprocessing OVMS") list(APPEND commands COMMAND ${Python3_EXECUTABLE} ${FILE_HELPER_SCRIPT} diff --git a/docs/articles_en/about-openvino/openvino-ecosystem/openvino-integrations.rst b/docs/articles_en/about-openvino/openvino-ecosystem/openvino-integrations.rst index 16283402a68c31..9b49cc780a819e 100644 --- a/docs/articles_en/about-openvino/openvino-ecosystem/openvino-integrations.rst +++ b/docs/articles_en/about-openvino/openvino-ecosystem/openvino-integrations.rst @@ -27,7 +27,7 @@ OpenVINO™ Integrations .. grid-item:: * :doc:`Run inference with HuggingFace and Optimum Intel <../../openvino-workflow-generative/inference-with-optimum-intel>` - * `A notebook example: llm-chatbot `__ + * `A notebook example: llm-chatbot `__ * `Hugging Face Inference documentation `__ * `Hugging Face Compression documentation `__ * `Hugging Face Reference Documentation `__ diff --git a/docs/articles_en/get-started.rst b/docs/articles_en/get-started.rst index a9302fbb05272f..097091c5b32ef9 100644 --- a/docs/articles_en/get-started.rst +++ b/docs/articles_en/get-started.rst @@ -21,7 +21,7 @@ GET STARTED

Welcome to OpenVINO! This guide introduces installation and learning materials for Intel® Distribution of OpenVINO™ toolkit. The guide walks through the following steps:
-   Quick Start Example
+   Quick Start Example
    Install OpenVINO
    Learn OpenVINO

@@ -40,7 +40,8 @@ For a quick reference, check out .. image:: https://user-images.githubusercontent.com/15709723/127752390-f6aa371f-31b5-4846-84b9-18dd4f662406.gif :width: 400 -Try out OpenVINO's capabilities with this `quick start example `__ that estimates depth in a scene using an OpenVINO monodepth model to quickly see how to load a model, prepare an image, inference the image, and display the result. +Try out OpenVINO's capabilities with this `quick start example `__ +that estimates depth in a scene using an OpenVINO monodepth model to quickly see how to load a model, prepare an image, inference the image, and display the result. .. _install-openvino-gsg: @@ -70,10 +71,12 @@ Interactive Tutorials - Jupyter Notebooks Start with :doc:`interactive Python ` that show the basics of model inference, the OpenVINO API, how to convert models to OpenVINO format, and more. -* `Hello Image Classification `__ - Load an image classification model in OpenVINO and use it to apply a label to an image -* `OpenVINO Runtime API Tutorial `__ - Learn the basic Python API for working with models in OpenVINO -* `Convert TensorFlow Models to OpenVINO `__ -* `Convert PyTorch Models to OpenVINO `__ +* `Hello Image Classification `__ + - Load an image classification model in OpenVINO and use it to apply a label to an image +* `OpenVINO Runtime API Tutorial `__ + - Learn the basic Python API for working with models in OpenVINO +* `Convert TensorFlow Models to OpenVINO `__ +* `Convert PyTorch Models to OpenVINO `__ .. _code-samples: @@ -101,8 +104,8 @@ Model Compression and Quantization Use OpenVINO’s model compression tools to reduce your model’s latency and memory footprint while maintaining good accuracy. -* Tutorial - `Quantization-Aware Training in TensorFlow with OpenVINO NNCF `__ -* Tutorial - `Quantization-Aware Training in PyTorch with NNCF `__ +* Tutorial - `Quantization-Aware Training in TensorFlow with OpenVINO NNCF `__ +* Tutorial - `Quantization-Aware Training in PyTorch with NNCF `__ * :doc:`Model Optimization Guide ` Automated Device Configuration diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-archive-linux.rst b/docs/articles_en/get-started/install-openvino/install-openvino-archive-linux.rst index 226cacbc3af74e..943c7ab6f86804 100644 --- a/docs/articles_en/get-started/install-openvino/install-openvino-archive-linux.rst +++ b/docs/articles_en/get-started/install-openvino/install-openvino-archive-linux.rst @@ -218,7 +218,7 @@ Learn more about how to integrate a model in OpenVINO applications by trying out .. tab-item:: Get started with Python :sync: get-started-py - Try the `Python Quick Start Example <../../notebooks/vision-monodepth-with-output.html>`__ + Try the `Python Quick Start Example `__ to estimate depth in a scene using an OpenVINO monodepth model in a Jupyter Notebook inside your web browser. .. 
image:: https://user-images.githubusercontent.com/15709723/127752390-f6aa371f-31b5-4846-84b9-18dd4f662406.gif @@ -226,9 +226,9 @@ Learn more about how to integrate a model in OpenVINO applications by trying out Visit the :doc:`Tutorials <../../../get-started/learn-openvino/interactive-tutorials-python>` page for more Jupyter Notebooks to get you started with OpenVINO, such as: - * `OpenVINO Python API Tutorial <../../notebooks/openvino-api-with-output.html>`__ - * `Basic image classification program with Hello Image Classification <../../notebooks/hello-world-with-output.html>`__ - * `Convert a PyTorch model and use it for image background removal <../../notebooks/vision-background-removal-with-output.html>`__ + * `OpenVINO Python API Tutorial `__ + * `Basic image classification program with Hello Image Classification `__ + * `Convert a PyTorch model and use it for image background removal `__ .. tab-item:: Get started with C++ diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-archive-macos.rst b/docs/articles_en/get-started/install-openvino/install-openvino-archive-macos.rst index 10bd9d36185d11..1c8d669ad02893 100644 --- a/docs/articles_en/get-started/install-openvino/install-openvino-archive-macos.rst +++ b/docs/articles_en/get-started/install-openvino/install-openvino-archive-macos.rst @@ -140,16 +140,16 @@ Now that you've installed OpenVINO Runtime, you're ready to run your own machine .. tab-item:: Get started with Python :sync: get-started-py - Try the `Python Quick Start Example <../../notebooks/vision-monodepth-with-output.html>`__ to estimate depth in a scene using an OpenVINO monodepth model in a Jupyter Notebook inside your web browser. + Try the `Python Quick Start Example `__ to estimate depth in a scene using an OpenVINO monodepth model in a Jupyter Notebook inside your web browser. .. image:: https://user-images.githubusercontent.com/15709723/127752390-f6aa371f-31b5-4846-84b9-18dd4f662406.gif :width: 400 Visit the :doc:`Tutorials <../../../get-started/learn-openvino/interactive-tutorials-python>` page for more Jupyter Notebooks to get you started with OpenVINO, such as: - * `OpenVINO Python API Tutorial <../../notebooks/openvino-api-with-output.html>`__ - * `Basic image classification program with Hello Image Classification <../../notebooks/hello-world-with-output.html>`__ - * `Convert a PyTorch model and use it for image background removal <../../notebooks/vision-background-removal-with-output.html>`__ + * `OpenVINO Python API Tutorial `__ + * `Basic image classification program with Hello Image Classification `__ + * `Convert a PyTorch model and use it for image background removal `__ .. tab-item:: Get started with C++ :sync: get-started-cpp diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-archive-windows.rst b/docs/articles_en/get-started/install-openvino/install-openvino-archive-windows.rst index c8a2e400afd1fb..e23247290a096a 100644 --- a/docs/articles_en/get-started/install-openvino/install-openvino-archive-windows.rst +++ b/docs/articles_en/get-started/install-openvino/install-openvino-archive-windows.rst @@ -156,16 +156,16 @@ Now that you've installed OpenVINO Runtime, you're ready to run your own machine .. tab-item:: Get started with Python :sync: get-started-py - Try the `Python Quick Start Example <../../notebooks/vision-monodepth-with-output.html>`__ to estimate depth in a scene using an OpenVINO monodepth model in a Jupyter Notebook inside your web browser. 
+ Try the `Python Quick Start Example `__ to estimate depth in a scene using an OpenVINO monodepth model in a Jupyter Notebook inside your web browser. .. image:: https://user-images.githubusercontent.com/15709723/127752390-f6aa371f-31b5-4846-84b9-18dd4f662406.gif :width: 400 Visit the :doc:`Tutorials <../../../get-started/learn-openvino/interactive-tutorials-python>` page for more Jupyter Notebooks to get you started with OpenVINO, such as: - * `OpenVINO Python API Tutorial <../../notebooks/openvino-api-with-output.html>`__ - * `Basic image classification program with Hello Image Classification <../../notebooks/hello-world-with-output.html>`__ - * `Convert a PyTorch model and use it for image background removal <../../notebooks/vision-background-removal-with-output.html>`__ + * `OpenVINO Python API Tutorial `__ + * `Basic image classification program with Hello Image Classification `__ + * `Convert a PyTorch model and use it for image background removal `__ .. tab-item:: Get started with C++ :sync: get-started-cpp diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-genai.rst b/docs/articles_en/get-started/install-openvino/install-openvino-genai.rst index c2ef4ca687fe26..89944c1523ba04 100644 --- a/docs/articles_en/get-started/install-openvino/install-openvino-genai.rst +++ b/docs/articles_en/get-started/install-openvino/install-openvino-genai.rst @@ -9,9 +9,9 @@ and returns the generated content. For a quickstart guide, refer to the :doc:`GenAI API Guide <../../openvino-workflow-generative/inference-with-genai>`. To see OpenVINO GenAI in action, check these Jupyter notebooks: -`LLM-powered Chatbot `__ +`LLM-powered Chatbot `__ and -`LLM Instruction-following pipeline `__. +`LLM Instruction-following pipeline `__. OpenVINO GenAI is available for installation via PyPI and Archive distributions. A `detailed guide `__ diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-pip.rst b/docs/articles_en/get-started/install-openvino/install-openvino-pip.rst index bf0965e3edf7fc..4311f14dafe4ed 100644 --- a/docs/articles_en/get-started/install-openvino/install-openvino-pip.rst +++ b/docs/articles_en/get-started/install-openvino/install-openvino-pip.rst @@ -141,7 +141,7 @@ the following tutorials. .. image:: https://user-images.githubusercontent.com/15709723/127752390-f6aa371f-31b5-4846-84b9-18dd4f662406.gif :width: 400 -Try the `Python Quick Start Example `__ +Try the `Python Quick Start Example `__ to estimate depth in a scene using an OpenVINO monodepth model in a Jupyter Notebook inside your web browser. 
@@ -152,9 +152,9 @@ Get started with Python Visit the :doc:`Tutorials <../../../get-started/learn-openvino/interactive-tutorials-python>` page for more Jupyter Notebooks to get you started with OpenVINO, such as: -* `OpenVINO Python API Tutorial `__ -* `Basic image classification program with Hello Image Classification `__ -* `Convert a PyTorch model and use it for image background removal `__ +* `OpenVINO Python API Tutorial `__ +* `Basic image classification program with Hello Image Classification `__ +* `Convert a PyTorch model and use it for image background removal `__ diff --git a/docs/articles_en/get-started/learn-openvino/interactive-tutorials-python.rst b/docs/articles_en/get-started/learn-openvino/interactive-tutorials-python.rst index f8b50e489488fa..3fb3dcd5f3145f 100644 --- a/docs/articles_en/get-started/learn-openvino/interactive-tutorials-python.rst +++ b/docs/articles_en/get-started/learn-openvino/interactive-tutorials-python.rst @@ -14,17 +14,13 @@ Interactive Tutorials (Python) :hidden: interactive-tutorials-python/notebooks-installation + interactive-tutorials-python/run-notebooks -Jupyter notebooks show how to use various OpenVINO features to run optimized deep learning -inference with Python. Notebooks with |binder logo| and |colab logo| buttons can be run in the -browser, no installation required. Just choose a tutorial and click the button. - -`Binder `__ and `Google Colab `__ -are free online services with limited resources. For the best performance -and more control, you should run the notebooks locally. Follow the -:doc:`Installation Guide ` in order to get information -on how to run and manage the notebooks on your system. +The tutorials show how to use various OpenVINO Python API features to run +optimized deep learning inference. They are not maintained on this website, +however, you can use the selector below to reach Jupyter notebooks from the +`openvino_notebooks `__ repository. .. raw:: html @@ -32,10 +28,24 @@ on how to run and manage the notebooks on your system. -.. note:: - If you have any issues with the notebooks, refer to the **Troubleshooting** and **FAQ** - sections in the :doc:`Installation Guide ` or start a GitHub - `discussion `__. +Notebooks with |binder logo| and |colab logo| buttons can be run in the +browser, no installation required. Just choose a tutorial and click the button. + +`Binder `__ and `Google Colab `__ +are free online services with limited resources. For the best performance +and more control, you should run the notebooks locally. Follow the +:doc:`Installation Guide ` in order +to get information on how to run and manage the notebooks on your system. + + +If you have any issues with the notebooks, refer to the +`Troubleshooting `__ +and +`FAQ `__ +sections in the +`Installation Guide `__ +or start a GitHub +`discussion `__. Additional Resources diff --git a/docs/articles_en/get-started/learn-openvino/interactive-tutorials-python/notebooks-installation.rst b/docs/articles_en/get-started/learn-openvino/interactive-tutorials-python/notebooks-installation.rst index ba7859a0c9f5d1..7842383c8901b9 100644 --- a/docs/articles_en/get-started/learn-openvino/interactive-tutorials-python/notebooks-installation.rst +++ b/docs/articles_en/get-started/learn-openvino/interactive-tutorials-python/notebooks-installation.rst @@ -1,5 +1,5 @@ -Installation of OpenVINO™ Notebooks -===================================== +Install Notebooks +=============================================================================================== .. 
meta:: :description: An installation guide for Jupyter notebooks on which Python @@ -8,18 +8,16 @@ Installation of OpenVINO™ Notebooks The notebooks can be run in various environments. This guide will show you -how to run and manage them on your local system. +how to install them on your local system. Contents: - `Installation Guide <#installation-guide>`__ -- `Run the Notebooks <#run-the-notebooks>`__ -- `Manage the notebooks <#manage-the-notebooks>`__ - `Troubleshooting <#troubleshooting>`__ Installation Guide -################## +############################################################################################### The table below lists the supported operating systems and Python versions. @@ -48,7 +46,7 @@ OpenVINO Notebooks also require Git. Follow the guide below for your operating system or environment. Installing prerequisites -+++++++++++++++++++++++++++ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ .. tab-set:: @@ -59,58 +57,89 @@ Installing prerequisites Download 64 bit version of Python software (3.9 - 3.12) from `python.org `__ - Run the installer by double clicking it. Follow the installation steps to set up the software. + Run the installer by double clicking it. Follow the installation steps to set + up the software. While installing, make sure you check the box to *add Python to system PATH*. Also, it is recommended to use the installer option to disable the PATH length limit. .. note:: - Python software available in the Microsoft Store is not recommended. It may require additional packages. + Python software available in the Microsoft Store is not recommended. It may + require additional packages. 2. **Install GIT** - Download 64 bit version of GIT from `git-scm.org `__ + Download 64 bit version of GIT from + `git-scm.org `__ - Run the installer by double clicking it. Follow the installation steps to set up the software. + Run the installer by double clicking it. Follow the installation steps to set + up the software. - 3. Install FFMPEG (Optional) + 3. **Install Drivers for GPU, and NPU (AI PC)** + + It is recommended to perform a "Clean Install" of the + `WHQL Certified GPU driver `__ + to ensure the underlying libraries are correctly configured. + + For AI PC, install the latest + `Intel® NPU driver `__ + (or last known working driver - + `Windows 32.0.100.3053 `__ + ) to avoid any potential issues in compiling NPU kernels. + + 4. **Install C++ Redistributable (Required)** + + * Download + `Microsoft Visual C++ Redistributable `__. + * Run the installer and follow the instructions. + + 5. Install FFMPEG (Optional) Download FFMPEG binary from `here `__ - Set FFMPEG's path (e.g., ``C:\ffmpeg\bin``) to the PATH environmental variable on Windows. + Set FFMPEG's path (e.g., ``C:\ffmpeg\bin``) to the PATH environmental + variable on Windows. .. tab-item:: Linux :sync: linux - 4. **Install Python and GIT** + 1. **Install Python and GIT** .. note:: Linux Systems may require installation of additional libraries. - The following installation steps should work on a clean install of Ubuntu Desktop 20.04, and should also work on Ubuntu 22.04 and 20.10, and on Ubuntu Server. + The following installation steps should work on a clean install of Ubuntu + Desktop 20.04, and should also work on Ubuntu 22.04 and 20.10, and on Ubuntu Server. - .. code-block:: sh + .. 
code-block:: console sudo apt-get update sudo apt-get upgrade sudo apt-get install python3-venv build-essential python3-dev git-all libgl1-mesa-dev ffmpeg - For an Intel Integrated Graphics Card, you can install the `Intel Graphics Compute Runtime `__ to enable inference on this device. The command for Ubuntu 20.04 is: + For an Intel Integrated Graphics Card, you can install the + `Intel Graphics Compute Runtime `__ + to enable inference on this device. The command for Ubuntu 20.04 is: .. note:: - Only execute this command if you do not yet have OpenCL drivers installed. + Execute this command only if you have not installed OpenCL drivers yet: - .. code-block:: sh + .. code-block:: console - sudo apt-get install intel-opencl-icd + sudo apt-get install intel-opencl-icd + Follow the instructions discussed + `here `__ + to make sure the right permissions are enabled. - The following installation steps should work on a clean install of Red Hat, CentOS, Amazon Linux 2 or Fedora. If any issues occur, see the `Troubleshooting <#-troubleshooting>`__ section. + The following installation steps should work on a clean install of Red Hat, + CentOS, Amazon Linux 2 or Fedora. If any issues occur, see the + `Troubleshooting <#-troubleshooting>`__ section. - .. code-block:: sh + .. code-block:: console sudo yum update sudo yum upgrade @@ -119,25 +148,27 @@ Installing prerequisites .. tab-item:: macOS :sync: macos - Alternatively, you may skip steps 1-3 if you prefer to manually install `Python 3 `__ and `Git `__. + Alternatively, you may skip steps 1-3 if you prefer to manually install + `Python 3 `__ and `Git `__. 1. **Install Xcode Command Line Tools** - .. code-block:: sh + .. code-block:: console xcode-select --install 2. **Install Homebrew** - .. code-block:: sh + .. code-block:: console /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" - After you install it, follow the instructions from the Homebrew installation to set it up. + After you install it, follow the instructions from the Homebrew installation + to set it up. 3. **Install Python and dependencies** - .. code-block:: sh + .. code-block:: console brew install python@3.9 brew install protobuf @@ -149,24 +180,30 @@ Installing prerequisites .. note:: - If OpenVINO is installed globally, do not run any of these commands in a terminal where ``setupvars.sh`` is sourced. + If OpenVINO is installed globally, do not run any of these commands in a + terminal where ``setupvars.sh`` is sourced. .. tab-item:: Azure ML :sync: azure-ml .. note:: - An Azure account and access to `Azure ML Studio `__ are required. + An Azure account and access to `Azure ML Studio `__ + are required. 1. **Adding a Compute Instance** - In Azure ML Studio, `add a compute instance `__ and pick any CPU-based instance. At least 4 CPU cores and 8GB of RAM are recommended. + In Azure ML Studio, + `add a compute instance `__ + and pick any CPU-based instance. At least 4 CPU cores and 8GB of RAM + are recommended. |ml-studio-1| 2. **Start the Terminal** - Once the compute instance has started, open the terminal window and then follow the installation steps below. + Once the compute instance has started, open the terminal window and then + follow the installation steps below. |ml-studio-2| @@ -175,8 +212,7 @@ Installing prerequisites To run the notebooks inside a Linux-based Docker container, use the Dockerfile: - .. code-block:: sh - + .. 
code-block:: console :caption: Source: https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/Dockerfile FROM quay.io/thoth-station/s2i-thoth-ubi8-py38:v0.29.0 @@ -204,23 +240,24 @@ Installing prerequisites # Upgrade NodeJS > 12.0 # Install dos2unix for line end conversion on Windows - RUN curl -sL https://rpm.nodesource.com/setup_14.x | bash - && \ - yum remove -y nodejs && \ - yum install -y nodejs-14.18.1 mesa-libGL dos2unix libsndfile && \ - yum -y update-minimal --security --sec-severity=Important --sec-severity=Critical --sec-severity=Moderate + RUN dnf --disableplugin=subscription-manager remove -y nodejs && \ + dnf --disableplugin=subscription-manager module -y reset nodejs && \ + dnf --disableplugin=subscription-manager module -y enable nodejs:20 && \ + dnf --disableplugin=subscription-manager install -y nodejs mesa-libGL dos2unix libsndfile && \ + dnf --disableplugin=subscription-manager -y update-minimal --security --sec-severity=Important --sec-severity=Critical --sec-severity=Moderate # GPU drivers - RUN dnf install -y 'dnf-command(config-manager)' && \ - dnf config-manager --add-repo https://repositories.intel.com/graphics/rhel/8.5/intel-graphics.repo + RUN dnf --disableplugin=subscription-manager install -y 'dnf-command(config-manager)' && \ + dnf --disableplugin=subscription-manager config-manager --add-repo https://repositories.intel.com/gpu/rhel/8.6/lts/2350/unified/intel-gpu-8.6.repo RUN rpm -ivh https://vault.centos.org/centos/8/AppStream/x86_64/os/Packages/mesa-filesystem-21.1.5-1.el8.x86_64.rpm && \ - dnf install --refresh -y \ - intel-opencl-22.28.23726.1-i419.el8.x86_64 intel-media intel-mediasdk libmfxgen1 libvpl2 \ + dnf --disableplugin=subscription-manager install --refresh -y \ + intel-opencl intel-media intel-mediasdk libmfxgen1 libvpl2 \ level-zero intel-level-zero-gpu \ intel-metrics-library intel-igc-core intel-igc-cm \ libva libva-utils intel-gmmlib && \ rpm -ivh http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/ocl-icd-2.2.12-1.el8.x86_64.rpm && \ - rpm -ivh https://download-ib01.fedoraproject.org/pub/epel/8/Everything/x86_64/Packages/c/clinfo-3.0.21.02.21-4.el8.x86_64.rpm + rpm -ivh https://dl.fedoraproject.org/pub/epel/8/Everything/x86_64/Packages/c/clinfo-3.0.21.02.21-4.el8.x86_64.rpm # Copying in override assemble/run scripts COPY .docker/.s2i/bin /tmp/scripts @@ -294,7 +331,7 @@ Installing prerequisites 7. **Select a SageMaker image.** - Choose ``Data Science 3.0`` in "Select a SageMaker image" drop-down under + Choose ``Data Science 3.0`` in the "Select a SageMaker image" drop-down, under "Notebooks and compute resources". Then, click **+** on "Image Terminal" to start a terminal session: @@ -303,72 +340,77 @@ Installing prerequisites Installing notebooks -++++++++++++++++++++ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ .. tab-set:: .. tab-item:: Windows :sync: windows + .. important:: + + * Use Command Prompt (cmd.exe), not PowerShell, to run the commands below. + * If OpenVINO is installed globally, do not run any of the following commands + in a terminal, where ``setupvars.bat`` is sourced. + 1. **Create a Virtual Environment** - .. code-block:: sh + .. code-block:: console python -m venv openvino_env 2. **Activate the Environment** - .. code-block:: sh + .. code-block:: console openvino_env\Scripts\activate 3. **Clone the Repository** - Using the --depth=1 option for git clone reduces download size. 
+ Use the ``--depth=1`` option for git cloning to reduce the download size. - .. code-block:: sh + .. code-block:: console git clone --depth=1 https://github.com/openvinotoolkit/openvino_notebooks.git cd openvino_notebooks 4. **Upgrade PIP** - .. code-block:: sh + .. code-block:: console python -m pip install --upgrade pip wheel setuptools 5. **Install required packages** - .. code-block:: sh + .. code-block:: console pip install -r requirements.txt + .. important:: - .. important:: - - In case of problems with accessing HuggingFace in PRC, set-up the networking - environment before you launch the notebooks: + In case of problems with accessing HuggingFace in PRC, set-up the networking + environment before you launch the notebooks: - .. code-block:: + .. code-block:: - pip install -U huggingface_hub - set HF_ENDPOINT = https://hf-mirror.com + pip install -U huggingface_hub + set HF_ENDPOINT = https://hf-mirror.com - For more information, visit `HF-Mirror HuggingFace `__. + For more information, visit `HF-Mirror HuggingFace `__. .. tab-item:: Linux :sync: linux 1. **Create a Virtual Environment** - .. code-block:: sh + .. code-block:: console python3 -m venv openvino_env 2. **Activate the Environment** - .. code-block:: sh + .. code-block:: console source openvino_env/bin/activate @@ -376,21 +418,21 @@ Installing notebooks Using the --depth=1 option for git clone reduces download size. - .. code-block:: sh + .. code-block:: console git clone --depth=1 https://github.com/openvinotoolkit/openvino_notebooks.git cd openvino_notebooks 4. **Upgrade PIP** - .. code-block:: sh + .. code-block:: console python -m pip install --upgrade pip pip install wheel setuptools 5. **Install required packages** - .. code-block:: sh + .. code-block:: console pip install -r requirements.txt @@ -411,13 +453,13 @@ Installing notebooks 1. **Create a Virtual Environment** - .. code-block:: sh + .. code-block:: console python3 -m venv openvino_env 2. **Activate the Environment** - .. code-block:: sh + .. code-block:: console source openvino_env/bin/activate @@ -425,20 +467,20 @@ Installing notebooks Using the --depth=1 option for git clone reduces download size. - .. code-block:: sh + .. code-block:: console git clone --depth=1 https://github.com/openvinotoolkit/openvino_notebooks.git cd openvino_notebooks 4. **Upgrade PIP** - .. code-block:: sh + .. code-block:: console python -m pip install --upgrade pip wheel setuptools 5. **Install required packages** - .. code-block:: sh + .. code-block:: console pip install -r requirements.txt @@ -448,56 +490,62 @@ Installing notebooks 1. Create a Conda environment - .. code-block:: sh + .. code-block:: console conda create --name openvino_env python=3.9 -y 2. Activate the environment - .. code-block:: sh + .. code-block:: console conda activate openvino_env 3. Clone OpenVINO notebooks - .. code-block:: sh + .. code-block:: console git clone https://github.com/openvinotoolkit/openvino_notebooks.git 4. Change directory to ``openvino_notebooks`` - .. code-block:: sh + .. code-block:: console cd openvino_notebooks 5. Upgrade ``pip`` and install required dependencies. - .. code-block:: sh + .. code-block:: console python -m pip install --upgrade pip pip install -r requirements.txt 6. Add ``openvino_env`` to PATH - .. code-block:: sh + .. code-block:: console set PATH="/anaconda/envs/openvino_env/bin;%PATH%" 7. Run the notebooks. - To run the notebooks, click on Notebooks and refresh your Files: + a. To run the notebooks, click on "Notebooks" and refresh your "Files": - .. 
image:: https://user-images.githubusercontent.com/15709723/117580814-a725c300-b0ae-11eb-93bf-007779c26075.png + .. image:: https://user-images.githubusercontent.com/15709723/117580814-a725c300-b0ae-11eb-93bf-007779c26075.png - .. image:: https://user-images.githubusercontent.com/15709723/117559447-2af19800-b03a-11eb-8bd6-8813b7a8814f.png + b. Select a notebook: - .. image:: https://user-images.githubusercontent.com/15709723/117580973-37640800-b0af-11eb-91ae-7194b9b4e505.png + .. image:: https://user-images.githubusercontent.com/15709723/117559447-2af19800-b03a-11eb-8bd6-8813b7a8814f.png - .. note:: + c. Next, run all cells: + + .. image:: https://user-images.githubusercontent.com/15709723/117580973-37640800-b0af-11eb-91ae-7194b9b4e505.png + + d. Happy coding! - Make sure you are using the ``openvino_env`` environment (not Python 3). + .. important:: + + Make sure you are using the ``openvino_env`` environment (not Python 3). - .. image:: https://user-images.githubusercontent.com/1720147/162269003-7937b47c-484f-416c-97c7-bb869376ff68.png + .. image:: https://user-images.githubusercontent.com/1720147/162269003-7937b47c-484f-416c-97c7-bb869376ff68.png .. tab-item:: Docker @@ -505,30 +553,39 @@ Installing notebooks 1. **Clone the Repository** - .. code-block:: sh + .. code-block:: console git clone https://github.com/openvinotoolkit/openvino_notebooks.git cd openvino_notebooks 2. **Build the Docker Image** - .. code-block:: sh + .. code-block:: console docker build -t openvino_notebooks . 3. **Run the Docker Image** - .. code-block:: sh + a. Command for CPU only: - docker run -it -p 8888:8888 openvino_notebooks + .. code-block:: console - .. note:: + docker run -it -p 8888:8888 openvino_notebooks - For using model training notebooks, allocate additional memory: + .. note:: - .. code-block:: sh + For using model training notebooks, allocate additional memory: - docker run -it -p 8888:8888 --shm-size 8G openvino_notebooks + .. code-block:: console + + docker run -it -p 8888:8888 --shm-size 8G openvino_notebooks + + b. Command for CPU and GPU (requires system with integrated or + discrete Intel GPU): + + .. code-block:: console + + docker run -it --device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -p 8888:8888 openvino_notebooks 4. **Start the browser** @@ -539,11 +596,15 @@ Installing notebooks The Dockerfile can be used to run a local image on Windows, Linux or macOS. It is also compatible with Open Data Hub and Red Hat OpenShift Data Science. - The base layer is a `UBI 8 `__-based image provided by `Project Thoth `__. + The base layer is a + `UBI 8 `__ + -based image provided by `Project Thoth `__. .. note:: - While running the container on Windows and macOS, only CPU devices can be used. To access the iGPU, install the notebooks locally, following the instructions above. + While running the container on Windows and macOS, only CPU devices can be + used. To access the iGPU, install the notebooks locally, following the + instructions above. .. tab-item:: Amazon SageMaker @@ -555,7 +616,7 @@ Installing notebooks |amazon-studio-7| - 1. **Install few system dependencies.** + 1. **Install system dependencies.** .. code-block:: @@ -585,151 +646,41 @@ Installing notebooks 4. **Run the Notebooks** - * To run the notebooks, click the top level "openvino_notebooks" folder - and navigate to your example: - - |amazon-studio-8| - - * Choose "Image" - ``Data Science 3.0``, - "Kernel" - ``Python [conda env:openvino_env],`` - "Instance type"- your desired compute instance. 
- - |amazon-studio-9| - - |amazon-studio-10| - - |amazon-studio-11| - - .. note:: - - Make sure you use the ``Python [conda env:openvino_env]`` - environment (not ``Python 3``). - - * Next, run the cells of the notebook. You may try other notebooks to - explore OpenVINO features and examples. - - -Run the Notebooks -################# - -Launch a Single Notebook -++++++++++++++++++++++++ - -If you want to launch only one notebook, such as the *Monodepth* notebook, run the command below. - -.. code:: bash - - jupyter lab notebooks/vision-monodepth/vision-monodepth.ipynb - -Launch All Notebooks -++++++++++++++++++++ - -.. code:: bash - - jupyter lab notebooks - -In your browser, select a notebook from the file browser in Jupyter Lab, using the left sidebar. Each tutorial is located in a subdirectory within the ``notebooks`` directory. - -|launch-jupyter| - + a. To run the notebooks, click the top level "openvino_notebooks" folder + and navigate to your example: -Manage the Notebooks -#################### + |amazon-studio-8| -Shut Down Jupyter Kernel -++++++++++++++++++++++++ + b. Choose "Image" - ``Data Science 3.0``, + "Kernel" - ``Python [conda env:openvino_env],`` + "Instance type"- your desired compute instance. -To end your Jupyter session, press ``Ctrl-c``. This will prompt you to -``Shutdown this Jupyter server (y/[n])?`` enter ``y`` and hit ``Enter``. + |amazon-studio-9| -Deactivate Virtual Environment -++++++++++++++++++++++++++++++ + |amazon-studio-10| -First, make sure you use the terminal window where you activated ``openvino_env``. To deactivate your ``virtualenv``, simply run: + |amazon-studio-11| -.. code:: bash + .. note:: - deactivate + Make sure you use the ``Python [conda env:openvino_env]`` + environment (not ``Python 3``). -This will deactivate your virtual environment. - -Reactivate Virtual Environment -++++++++++++++++++++++++++++++ - -To reactivate your environment, run: - -.. tab-set:: - - .. tab-item:: Windows - :sync: windows - - .. code:: bash - - source openvino_env\Scripts\activate - - .. tab-item:: Linux - :sync: linux - - .. code:: bash - - source openvino_env/bin/activate - - .. tab-item:: macOS - :sync: macos - - .. code:: bash - - source openvino_env/bin/activate - - -Then type ``jupyter lab`` or ``jupyter notebook`` to launch the notebooks again. - -Delete Virtual Environment -++++++++++++++++++++++++++ - -This operation is optional. However, if you want to remove your virtual environment, simply delete the ``openvino_env`` directory: - -.. tab-set:: - - .. tab-item:: Windows - :sync: windows - - .. code:: bash - - rmdir /s openvino_env - - .. tab-item:: Linux - :sync: linux - - .. code:: bash - - rm -rf openvino_env - - .. tab-item:: macOS - :sync: macos - - .. code:: bash - - rm -rf openvino_env - - -Remove openvino_env Kernel from Jupyter -+++++++++++++++++++++++++++++++++++++++ - -.. code:: bash - - jupyter kernelspec remove openvino_env - - -If you run into issues, check the `Troubleshooting <#-troubleshooting>`__, and `FAQs <#-faq>`__ sections or start a GitHub -`discussion `__. + c. Next, run the cells of the notebook. You may try other notebooks to + explore OpenVINO features and examples. +Great! You have completed the installation. To learn how to launch and manage the notebooks, +see the :doc:`Run Notebooks ` article. 
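+
+As a quick sanity check before you open any notebook, you can confirm that OpenVINO
+imports correctly and list the inference devices it detects. The command below is only
+an illustrative check (not part of the notebooks themselves) and assumes the
+``openvino_env`` environment is still activated; it should print the installed OpenVINO
+version followed by the detected devices, for example ``['CPU']``.
+
+.. code-block:: console
+
+   python -c "import openvino as ov; print(ov.__version__); print(ov.Core().available_devices)"
+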
Troubleshooting -############### +############################################################################################### -For solutions to common issues during installation, refer to the `Troubleshooting `__ and -`FAQ `__ sections in `openvino_notebooks `__ repository. +For solutions to common issues during installation, refer to the +`Troubleshooting `__ +and +`FAQ `__ +sections in `openvino_notebooks `__ +repository. If the above tips do not solve your problem, feel free to open a `discussion topic `__ @@ -737,12 +688,10 @@ or create an `issue `__ on Github. Additional Resources -#################### - -* `OpenVINO™ Notebooks - Github Repository `_ +############################################################################################### +* `OpenVINO™ Notebooks - Github Repository `_ -.. |launch-jupyter| image:: https://user-images.githubusercontent.com/15709723/120527271-006fd200-c38f-11eb-9935-2d36d50bab9f.gif .. |ml-studio-1| image:: https://user-images.githubusercontent.com/15709723/117559437-17463180-b03a-11eb-9e8d-d4539d1502f2.png @@ -771,4 +720,3 @@ Additional Resources .. |amazon-studio-11| image:: https://user-images.githubusercontent.com/4837253/199812713-32074aa7-8190-43c8-815c-231542c7b286.png .. |docker-terminal-1| image:: https://user-images.githubusercontent.com/15709723/127793994-355e4d29-d131-432d-a12a-b08ca6131223.png - diff --git a/docs/articles_en/get-started/learn-openvino/interactive-tutorials-python/run-notebooks.rst b/docs/articles_en/get-started/learn-openvino/interactive-tutorials-python/run-notebooks.rst new file mode 100644 index 00000000000000..7ad05034b551d8 --- /dev/null +++ b/docs/articles_en/get-started/learn-openvino/interactive-tutorials-python/run-notebooks.rst @@ -0,0 +1,135 @@ +Run Notebooks +=============================================================================================== + +.. meta:: + :description: Run Jupyter notebooks to go through the tutorials and learn OpenVINO™ toolkit. + + +This guide will show you how to launch and manage OpenVINO™ Notebooks on your machine. +Before you proceed, make sure you have followed the +:doc:`installation steps `. + + +Launch a Single Notebook +############################################################################################### + +If you want to launch only one notebook, such as the *Monodepth* notebook, run the +command below. + +.. code:: bash + + jupyter lab notebooks/vision-monodepth/vision-monodepth.ipynb + +Launch All Notebooks +############################################################################################### + +.. code:: bash + + jupyter lab notebooks + +In your browser, select a notebook from the file browser in Jupyter Lab, using the left +sidebar. Each tutorial is located in a subdirectory within the ``notebooks`` directory. + +|launch-jupyter| + + +Manage the Notebooks +############################################################################################### + +Shut Down Jupyter Kernel ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +To end your Jupyter session, press ``Ctrl-c``. This will prompt you to +``Shutdown this Jupyter server (y/[n])?`` enter ``y`` and hit ``Enter``. + +Deactivate Virtual Environment ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +First, make sure you use the terminal window where you activated ``openvino_env``. +To deactivate your ``virtualenv``, simply run: + +.. 
code:: bash + + deactivate + +This will deactivate your virtual environment. + +Reactivate Virtual Environment ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +To reactivate your environment, run: + +.. tab-set:: + + .. tab-item:: Windows + :sync: windows + + .. code:: bash + + source openvino_env\Scripts\activate + + .. tab-item:: Linux + :sync: linux + + .. code:: bash + + source openvino_env/bin/activate + + .. tab-item:: macOS + :sync: macos + + .. code:: bash + + source openvino_env/bin/activate + + +Then type ``jupyter lab`` or ``jupyter notebook`` to launch the notebooks again. + +Delete Virtual Environment ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +This operation is optional. However, if you want to remove your virtual environment, +simply delete the ``openvino_env`` directory: + +.. tab-set:: + + .. tab-item:: Windows + :sync: windows + + .. code:: bash + + rmdir /s openvino_env + + .. tab-item:: Linux + :sync: linux + + .. code:: bash + + rm -rf openvino_env + + .. tab-item:: macOS + :sync: macos + + .. code:: bash + + rm -rf openvino_env + + +Remove openvino_env Kernel from Jupyter ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. code:: bash + + jupyter kernelspec remove openvino_env + + +If you run into issues, check the `Troubleshooting <#-troubleshooting>`__, and +`FAQs <#-faq>`__ sections or start a GitHub +`discussion `__. + +.. |launch-jupyter| image:: https://user-images.githubusercontent.com/15709723/120527271-006fd200-c38f-11eb-9935-2d36d50bab9f.gif + +Additional Resources +#################### + +* `OpenVINO™ Notebooks - Github Repository `_ diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training.rst index 24143675a74a59..395320487d64fd 100644 --- a/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training.rst +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training.rst @@ -70,7 +70,7 @@ NNCF provides some state-of-the-art compression methods that are still in the ex stages of development and are only recommended for expert developers. These include: * Mixed-precision quantization. -* Sparsity (check out the `Sparsity-Aware Training notebook `__). +* Sparsity (check out the `Sparsity-Aware Training notebook `__). * Movement Pruning (Movement Sparsity). 
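+
+The snippet below is a rough, illustrative sketch of how a training-time algorithm such
+as magnitude sparsity is typically enabled through NNCF's configuration-based API. It is
+not taken from the notebook linked above: the model, input shape, and configuration
+values are placeholders, and the exact entry points and config keys may differ between
+NNCF versions, so treat it only as a starting point.
+
+.. code-block:: python
+
+   import torch
+   from nncf import NNCFConfig
+   from nncf.torch import create_compressed_model
+
+   # Placeholder PyTorch model; substitute your own network.
+   model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU())
+
+   nncf_config = NNCFConfig.from_dict({
+       "input_info": {"sample_size": [1, 3, 224, 224]},
+       # Scheduling parameters (target sparsity level, ramp-up epochs, etc.)
+       # would normally be added under a "params" section here.
+       "compression": {"algorithm": "magnitude_sparsity"},
+   })
+
+   # Wrap the model with sparsity hooks; the usual training loop then runs on
+   # compressed_model, calling compression_ctrl.scheduler.epoch_step() each epoch.
+   compression_ctrl, compressed_model = create_compressed_model(model, nncf_config)
+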
To learn `more about these methods `__, diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes.rst index 2d560b343c3b26..b4f3722cb262a1 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes.rst @@ -82,7 +82,7 @@ Accordingly, the code that loops over all available devices of the "GPU" type on Additional Resources #################### -* `OpenVINO™ Runtime API Tutorial <../../notebooks/openvino-api-with-output.html>`__ -* `AUTO Device Tutorial <../../notebooks/auto-device-with-output.html>`__ -* `GPU Device Tutorial <../../notebooks/gpu-device-with-output.html>`__ -* `NPU Device Tutorial <../../notebooks/hello-npu-with-output.html>`__ \ No newline at end of file +* `OpenVINO™ Runtime API Tutorial `__ +* `AUTO Device Tutorial `__ +* `GPU Device Tutorial `__ +* `NPU Device Tutorial `__ \ No newline at end of file diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection.rst index 539ed84edb120e..3a5df5faa2c550 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection.rst @@ -315,8 +315,8 @@ asynchronous inference pipeline. For information on asynchronous inference, see The following notebooks provide examples of how to set up an asynchronous pipeline: * :doc:`Image Classification Async Sample <../../../get-started/learn-openvino/openvino-samples/image-classification-async>` -* `Notebook - Asynchronous Inference with OpenVINO™ <./../../../notebooks/async-api-with-output.html>`__ -* `Notebook - Automatic Device Selection with OpenVINO <./../../../notebooks/auto-device-with-output.html>`__ +* `Notebook - Asynchronous Inference with OpenVINO™ `__ +* `Notebook - Automatic Device Selection with OpenVINO `__ LATENCY -------------------- diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.rst index 4b2915333249ce..921868d4f04ba1 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.rst @@ -474,7 +474,7 @@ Since OpenVINO relies on the OpenCL kernels for the GPU implementation, many gen Additional Resources ####################################### -* `Working with GPUs in OpenVINO™ Notebook `__ +* `Working with GPUs in OpenVINO™ Notebook `__ * :doc:`Inference Devices and Modes <../inference-devices-and-modes>`. * :doc:`Optimization guide <../optimize-inference>`. 
* `GPU plugin developer documentation `__ diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst index 5a3b1d7923d06e..823f6987af7874 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst @@ -282,5 +282,5 @@ guaranteed. Additional Resources ############################# -* `Working with NPUs in OpenVINO™ Notebook `__ -* `Vision colorization Notebook <./../../../notebooks/vision-image-colorization-with-output.html>`__ +* `Working with NPUs in OpenVINO™ Notebook `__ +* `Vision colorization Notebook `__ diff --git a/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application.rst b/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application.rst index 8381112683b52f..d43f81187f13b2 100644 --- a/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application.rst +++ b/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application.rst @@ -438,7 +438,7 @@ To build your project using CMake with the default build tools currently availab Additional Resources #################### -* `OpenVINO™ Runtime API Tutorial <./../../notebooks/openvino-api-with-output.html>`__ +* `OpenVINO™ Runtime API Tutorial `__ * See the :doc:`OpenVINO Samples <../../get-started/learn-openvino/openvino-samples>` page for specific examples of how OpenVINO pipelines are implemented for applications like image classification, text prediction, and many others. * Models in the OpenVINO IR format on `Hugging Face `__. * :doc:`OpenVINO™ Runtime Preprocessing ` diff --git a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/layout-api-overview.rst b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/layout-api-overview.rst index df4b47c44688c6..2577147bbc9b46 100644 --- a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/layout-api-overview.rst +++ b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimize-preprocessing/layout-api-overview.rst @@ -22,7 +22,7 @@ Below is a list of cases where input/output layout is important: * Doing the same operations as used during the model conversion phase. For more information, refer to the: * :doc:`Convert to OpenVINO <../../../model-preparation/convert-model-to-ir>` - * `OpenVINO Model Conversion Tutorial `__ + * `OpenVINO Model Conversion Tutorial `__ * Improving the readability of a model input and output. diff --git a/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst b/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst index 9597de75ffe73d..edbd94126c29ee 100644 --- a/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst +++ b/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst @@ -139,5 +139,5 @@ sequences. 
You can find more examples demonstrating how to work with states in other articles: -* `LLaVA-NeXT Multimodal Chatbot notebook <../../notebooks/llava-next-multimodal-chatbot-with-output.html>`__ +* `LLaVA-NeXT Multimodal Chatbot notebook `__ * :doc:`Serving Stateful Models with OpenVINO Model Server <../../model-server/ovms_docs_stateful_models>` diff --git a/docs/articles_en/openvino-workflow/running-inference/stateful-models/obtaining-stateful-openvino-model.rst b/docs/articles_en/openvino-workflow/running-inference/stateful-models/obtaining-stateful-openvino-model.rst index 48e9671f3b6d99..00628af4f18537 100644 --- a/docs/articles_en/openvino-workflow/running-inference/stateful-models/obtaining-stateful-openvino-model.rst +++ b/docs/articles_en/openvino-workflow/running-inference/stateful-models/obtaining-stateful-openvino-model.rst @@ -257,7 +257,7 @@ To apply LowLatency2 Transformation, follow the instruction below: In such a case, trim non-reshapable layers via :doc:`Conversion Parameters <../../model-preparation/conversion-parameters>`: - ``--input`` and ``--output``. For example, check the `OpenVINO Model Conversion Tutorial `__. + ``--input`` and ``--output``. For example, check the `OpenVINO Model Conversion Tutorial `__. As for the parameter and the problematic constant in the picture above, it can be trimmed by using the ``--input Reshape_layer_name`` command-line option. The problematic diff --git a/docs/nbdoc/README.md b/docs/nbdoc/README.md deleted file mode 100644 index d3d23118f3582d..00000000000000 --- a/docs/nbdoc/README.md +++ /dev/null @@ -1,48 +0,0 @@ -# Jupyter notebooks autodoc - -Auto fetching documentations designed for openvino notebooks tutorials. -This module is responsible for fetching artifacts, in this particular example jupyter tutorial notebooks and converting them to notebook documentation. - -## Step 0. Prepare venv - -To start new venv based on system interpreter - -``` bash -python -m venv ./venv -``` - -Linux: - -``` bash -source ./venv/bin/activate -``` - -Windows (cmd): - -``` bash -venv/Scripts/activate -``` - -## Step 1. Download requirements - -``` bash -python -m pip install -r requirements.txt -``` - -## Step 2. Configure consts to meet project directions - -[Consts file](consts.py) contains multiple variables that might differ for different environments. - -## Step 3. Add classes with methods to makefile or other executed file - -[Main file](main.py) contains example usecases of auto generator for notebooks. Informations placed in [main](main.py) should be further used to add it to makefile and possibly fully automate notebook documentation process. - -## Step 4. 
Run python file (optional) - -If step 4 was skipped use command - -``` bash -python main.py -``` - -or use any other command that is responsible for generating documentation diff --git a/docs/nbdoc/__init__.py b/docs/nbdoc/__init__.py deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/docs/nbdoc/consts.py b/docs/nbdoc/consts.py deleted file mode 100644 index 9ffa8c47e2399b..00000000000000 --- a/docs/nbdoc/consts.py +++ /dev/null @@ -1,48 +0,0 @@ -from pathlib import Path -import base64 - -notebooks_path = "notebooks" -repo_directory = "notebooks" -repo_owner = "openvinotoolkit" -repo_name = "openvino_notebooks" -repo_branch = "tree/main" -artifacts_link = "http://repository.toolbox.iotg.sclab.intel.com/projects/ov-notebook/0.1.0-latest/20250203220806/dist/rst_files/" -blacklisted_extensions = ['.xml', '.bin'] -notebooks_repo = "https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/" -notebooks_binder = "https://mybinder.org/v2/gh/openvinotoolkit/openvino_notebooks/HEAD?filepath=" -notebooks_colab = "https://colab.research.google.com/github/openvinotoolkit/openvino_notebooks/blob/latest/" -file_with_binder_notebooks = Path('../../docs/notebooks/notebooks_with_binder_buttons.txt').resolve(strict=True) -file_with_colab_notebooks = Path('../../docs/notebooks/notebooks_with_colab_buttons.txt').resolve(strict=True) -openvino_notebooks_ipynb_list = Path('../../docs/notebooks/all_notebooks_paths.txt').resolve(strict=True) -binder_image_source = Path('../../docs/articles_en/assets/images/launch_in_binder.svg').resolve(strict=True) -binder_image_source_data = open(binder_image_source, 'rb').read() -binder_image_source_data_base64 = base64.b64encode(binder_image_source_data) -binder_image_base64 = binder_image_source_data_base64.decode() -colab_image_source = Path('../../docs/articles_en/assets/images/open_in_colab.svg').resolve(strict=True) -colab_image_source_data = open(colab_image_source, 'rb').read() -colab_image_source_data_base64 = base64.b64encode(colab_image_source_data) -colab_image_base64 = colab_image_source_data_base64.decode() -github_image_source = Path('../../docs/articles_en/assets/images/view_on_github.svg').resolve(strict=True) -github_image_source_data = open(github_image_source, 'rb').read() -github_image_source_data_base64 = base64.b64encode(github_image_source_data) -github_image_base64 = github_image_source_data_base64.decode() - -binder_colab_template = """ -This Jupyter notebook can be launched on-line, opening an interactive environment in a browser window. -You can also make a |installation_link|. Choose one of the following options: - -{{ link_binder }}{{ link_colab }}{{ link_git }} - -{{ installation_link }} - -\n -""" -no_binder_template = """ -This Jupyter notebook can be launched after a |installation_link| only. 
- -{{ link_git }} - -{{ installation_link }} - -\n -""" diff --git a/docs/nbdoc/nbdoc.py b/docs/nbdoc/nbdoc.py deleted file mode 100644 index d93566d3e96dee..00000000000000 --- a/docs/nbdoc/nbdoc.py +++ /dev/null @@ -1,154 +0,0 @@ -import os -import argparse -import shutil -from pathlib import Path -from utils import ( - create_content, - add_content_below, - verify_notebook_name, -) -from consts import ( - binder_colab_template, - notebooks_path, - no_binder_template, - repo_directory, - repo_name, - openvino_notebooks_ipynb_list, - file_with_binder_notebooks, - file_with_colab_notebooks, - repo_owner, - notebooks_repo, - notebooks_binder, - notebooks_colab, - binder_image_base64, - colab_image_base64, - github_image_base64, -) - - -def fetch_binder_list(binder_list_file) -> list: - """Function that fetches list of notebooks with binder buttons - - :return: List of notebooks containing binder buttons - :rtype: list - """ - if binder_list_file: - with open(binder_list_file) as file: - list_of_buttons = file.read().splitlines() - return list_of_buttons - - -def fetch_colab_list(colab_list_file) -> list: - """Function that fetches list of notebooks with colab buttons - - :return: List of notebooks containing colab buttons - :rtype: list - """ - if colab_list_file: - with open(colab_list_file) as file: - list_of_cbuttons = file.read().splitlines() - return list_of_cbuttons - - -def add_glob_directive(tutorials_file): - """This function modifies toctrees of the five node articles in tutorials - section. It adds the notebooks found in docs/notebooks directory to the menu. - """ - with open(tutorials_file, 'r+', encoding='cp437') as mainfile: - readfile = mainfile.read() - if ':glob:' not in readfile: - add_glob = readfile\ - .replace(":hidden:\n", ":hidden:\n :glob:\n")\ - .replace(" interactive-tutorials-python/notebooks-installation\n", " interactive-tutorials-python/notebooks-installation\n ../../notebooks/*\n") - mainfile.seek(0) - mainfile.write(add_glob) - mainfile.truncate() - - -class NbProcessor: - def __init__(self, nb_path: str = notebooks_path): - self.nb_path = nb_path - - def add_binder(self, buttons_list: list, cbuttons_list: list, template_with_colab_and_binder: str = binder_colab_template, template_without_binder: str = no_binder_template): - """A function working as an example of how to add Binder or Google Colab buttons to existing RST files. - - :param buttons_list: A list of notebooks that work on Binder. - :type buttons_list: list - :param cbuttons_list: A list of notebooks that work on Google Colab. - :type cbuttons_list: list - :param template_with_colab_and_binder: A template with buttons added to an RST file if Binder and/or Google Colab are available. Defaults to template_with_colab_and_binder. - :type template_with_colab_and_binder: str - :param template_without_binder: A template with buttons added to an RST file if neither Binder nor Google Colab are available. Defaults to no_binder_template. - :type template_without_binder: str - :raises FileNotFoundError: In case of a failure in adding the content, an error will appear. 
- - """ - - if not os.path.exists(openvino_notebooks_ipynb_list): - raise FileNotFoundError("all_notebooks_paths.txt is not found") - else: - with open(openvino_notebooks_ipynb_list, 'r+', encoding='cp437') as ipynb_file: - openvino_notebooks_paths_list = ipynb_file.read() - - for notebook_file in [nb for nb in os.listdir(self.nb_path) if verify_notebook_name(nb)]: - - notebook_ipynb_ext = notebook_file[:-16] + ".ipynb" - nb_path_match = [line for line in openvino_notebooks_paths_list.split('\n') if notebook_ipynb_ext in line] - nb_repo_path = ''.join(nb_path_match) - notebook_item = '-'.join(notebook_file.split('-')[:-2]) - - local_install = ".. |installation_link| raw:: html\n\n local installation \n\n" - binder_badge = ".. raw:: html\n\n Binder\n\n" - colab_badge = ".. raw:: html\n\n Google Colab\n\n" - github_badge = ".. raw:: html\n\n Github

\n\n" - - binder_data = { - "owner": repo_owner, - "repo": repo_name, - "folder": repo_directory, - "link_git": github_badge, - "link_binder": binder_badge if notebook_item in buttons_list else "", - "link_colab": colab_badge if notebook_item in cbuttons_list else "", - "installation_link": local_install - } - - if notebook_item in buttons_list or notebook_item in cbuttons_list: - template = template_with_colab_and_binder - else: - template = template_without_binder - - button_text = create_content(template, binder_data, notebook_file) - if not add_content_below(button_text, f"{self.nb_path}/{notebook_file}"): - raise FileNotFoundError("Unable to modify file") - - -def main(): - buttons_list = fetch_binder_list(file_with_binder_notebooks) - cbuttons_list = fetch_colab_list(file_with_colab_notebooks) - parser = argparse.ArgumentParser() - parser.add_argument('sourcedir', type=Path) - parser.add_argument('outdir', type=Path) - args = parser.parse_args() - sourcedir = args.sourcedir - outdir = args.outdir - - main_tutorials_file = Path('../../docs/articles_en/get-started/learn-openvino/interactive-tutorials-python.rst').resolve(strict=True) - add_glob_directive(main_tutorials_file) - shutil.copytree(sourcedir, outdir) - # Run processing on downloaded files in notebooks directory - nbp = NbProcessor(outdir) - # Add Binder, Google Colab, GitHub badges to notebooks - nbp.add_binder(buttons_list, cbuttons_list) - - -if __name__ == '__main__': - main() - diff --git a/docs/nbdoc/requirements.txt b/docs/nbdoc/requirements.txt deleted file mode 100644 index 9aab7c0470497f..00000000000000 --- a/docs/nbdoc/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -jinja2 -requests -lxml \ No newline at end of file diff --git a/docs/nbdoc/utils.py b/docs/nbdoc/utils.py deleted file mode 100644 index 6533c23a3de747..00000000000000 --- a/docs/nbdoc/utils.py +++ /dev/null @@ -1,57 +0,0 @@ -from jinja2 import Template - - -def create_content(template: str, notebooks_data: dict, file_name: str): - """Filling rst template with data - - :param template: jinja template that will be filled with notebook data - :type template: str - :param notebooks_data: data structure containing information required to fill template - :type notebooks_data: dict - :param file_name: file name - :type file_name: str - :returns: Filled template - :rtype: str - - """ - template = Template(template) - notebooks_data["notebook"] = "-".join(file_name.split("-")[:-2]) - return template.render(notebooks_data) - - -def add_content_below(text: str, path: str, line=3) -> bool: - """Add additional content (like binder button) to existing rst file - - :param text: Text that will be added inside rst file - :type text: str - :param path: Path to modified file - :type path: str - :param line: Line number that content will be added. Defaults to 3. 
- :type line: int - :returns: Informs about success or failure in modifying file - :rtype: bool - - """ - try: - with open(path, "r+", encoding="utf-8") as file: - current_file = file.readlines() - current_file[line:line] = text - file.seek(0) - file.writelines(current_file) - return True - except FileNotFoundError: - return False - - -def verify_notebook_name(notebook_name: str) -> bool: - """Verification based on notebook name - - :param notebook_name: Notebook name by default keeps convention: - name-with-dashes-with-output.rst, - example: hello-world-with-output.rst - :type notebook_name: str - :returns: Return if notebook meets requirements - :rtype: bool - - """ - return notebook_name[-16:] == "-with-output.rst" \ No newline at end of file diff --git a/docs/notebooks/3D-pose-estimation-with-output.rst b/docs/notebooks/3D-pose-estimation-with-output.rst deleted file mode 100644 index c24e62d6dd2ebd..00000000000000 --- a/docs/notebooks/3D-pose-estimation-with-output.rst +++ /dev/null @@ -1,689 +0,0 @@ -Live 3D Human Pose Estimation with OpenVINO -=========================================== - -This notebook demonstrates live 3D Human Pose Estimation with OpenVINO -via a webcam. We utilize the model -`human-pose-estimation-3d-0001 `__ -from `Open Model -Zoo `__. At the end -of this notebook, you will see live inference results from your webcam -(if available). Alternatively, you can also upload a video file to test -out the algorithms. **Make sure you have properly installed -the**\ `Jupyter -extension `__\ **and -been using JupyterLab to run the demo as suggested in the -``README.md``** - - **NOTE**: *To use a webcam, you must run this Jupyter notebook on a - computer with a webcam. If you run on a remote server, the webcam - will not work. However, you can still do inference on a video file in - the final step. This demo utilizes the Python interface in - ``Three.js`` integrated with WebGL to process data from the model - inference. These results are processed and displayed in the - notebook.* - -*To ensure that the results are displayed correctly, run the code in a -recommended browser on one of the following operating systems:* *Ubuntu, -Windows: Chrome* *macOS: Safari* - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Imports <#imports>`__ -- `The model <#the-model>`__ - - - `Download the model <#download-the-model>`__ - - `Convert Model to OpenVINO IR - format <#convert-model-to-openvino-ir-format>`__ - - `Select inference device <#select-inference-device>`__ - - `Load the model <#load-the-model>`__ - -- `Processing <#processing>`__ - - - `Model Inference <#model-inference>`__ - - `Draw 2D Pose Overlays <#draw-2d-pose-overlays>`__ - - `Main Processing Function <#main-processing-function>`__ - -- `Run <#run>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Make sure your `Jupyter -extension `__ -is working properly. To avoid errors that may arise from the version of -the dependency package, it is recommended to use the **JupyterLab** -instead of the Jupyter notebook to display image results. 
- -:: - - - pip install --upgrade pip && pip install -r requirements.txt - - jupyter labextension install --no-build @jupyter-widgets/jupyterlab-manager - - jupyter labextension install --no-build jupyter-datawidgets/extension - - jupyter labextension install jupyter-threejs - - jupyter labextension list - -You should see: - -:: - - JupyterLab v... - ... - jupyterlab-datawidgets v... enabled OK - @jupyter-widgets/jupyterlab-manager v... enabled OK - jupyter-threejs v... enabled OK - -Prerequisites -------------- - - - -**The ``pythreejs`` extension may not display properly when using a -Jupyter Notebook release. Therefore, it is recommended to use Jupyter -Lab instead.** - -.. code:: ipython3 - - import platform - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - - %pip install pythreejs "openvino>=2024.4.0" "opencv-python" "torch" "tqdm" --extra-index-url https://download.pytorch.org/whl/cpu - - -.. parsed-literal:: - - Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu - Collecting pythreejs - Using cached pythreejs-2.4.2-py3-none-any.whl.metadata (5.4 kB) - Collecting openvino>=2024.4.0 - Using cached openvino-2024.4.0-16579-cp38-cp38-manylinux2014_x86_64.whl.metadata (8.3 kB) - Collecting opencv-python - Using cached opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB) - Collecting torch - Using cached https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp38-cp38-linux_x86_64.whl (194.9 MB) - Collecting tqdm - Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB) - Requirement already satisfied: ipywidgets>=7.2.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pythreejs) (8.1.5) - Collecting ipydatawidgets>=1.1.1 (from pythreejs) - Using cached ipydatawidgets-4.3.5-py2.py3-none-any.whl.metadata (1.4 kB) - Collecting numpy (from pythreejs) - Using cached numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB) - Requirement already satisfied: traitlets in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pythreejs) (5.14.3) - Collecting openvino-telemetry>=2023.2.1 (from openvino>=2024.4.0) - Using cached openvino_telemetry-2025.0.0-py3-none-any.whl.metadata (2.3 kB) - Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2024.4.0) (24.2) - Collecting filelock (from torch) - Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB) - Requirement already satisfied: typing-extensions>=4.8.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (4.12.2) - Collecting sympy (from torch) - Using cached sympy-1.13.3-py3-none-any.whl.metadata (12 kB) - Collecting networkx (from torch) - Using cached https://download.pytorch.org/whl/networkx-3.3-py3-none-any.whl.metadata (5.1 kB) - Requirement already satisfied: jinja2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.1.5) - Collecting fsspec (from torch) - Downloading fsspec-2025.2.0-py3-none-any.whl.metadata 
(11 kB) - Collecting traittypes>=0.2.0 (from ipydatawidgets>=1.1.1->pythreejs) - Using cached traittypes-0.2.1-py2.py3-none-any.whl.metadata (1.0 kB) - Requirement already satisfied: comm>=0.1.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (0.2.2) - Requirement already satisfied: ipython>=6.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (8.12.3) - Requirement already satisfied: widgetsnbextension~=4.0.12 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (4.0.13) - Requirement already satisfied: jupyterlab-widgets~=3.0.12 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (3.0.13) - Requirement already satisfied: MarkupSafe>=2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jinja2->torch) (2.1.5) - INFO: pip is looking at multiple versions of networkx to determine which version is compatible with other requirements. This could take a while. - Collecting networkx (from torch) - Using cached https://download.pytorch.org/whl/networkx-3.2.1-py3-none-any.whl (1.6 MB) - Using cached networkx-3.1-py3-none-any.whl.metadata (5.3 kB) - Collecting mpmath<1.4,>=1.1.0 (from sympy->torch) - Using cached https://download.pytorch.org/whl/mpmath-1.3.0-py3-none-any.whl (536 kB) - Requirement already satisfied: backcall in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.2.0) - Requirement already satisfied: decorator in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (5.1.1) - Requirement already satisfied: jedi>=0.16 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.19.2) - Requirement already satisfied: matplotlib-inline in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.1.7) - Requirement already satisfied: pickleshare in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.7.5) - Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (3.0.50) - Requirement already satisfied: pygments>=2.4.0 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (2.19.1) - Requirement already satisfied: stack-data in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.6.3) - Requirement already satisfied: pexpect>4.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (4.9.0) - Requirement already satisfied: parso<0.9.0,>=0.8.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.8.4) - Requirement already satisfied: ptyprocess>=0.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.7.0) - Requirement already satisfied: wcwidth in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.2.13) - Requirement already satisfied: executing>=1.2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (2.2.0) - Requirement already satisfied: asttokens>=2.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (3.0.0) - Requirement already satisfied: pure-eval in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.2.3) - Using cached pythreejs-2.4.2-py3-none-any.whl (3.4 MB) - Using cached openvino-2024.4.0-16579-cp38-cp38-manylinux2014_x86_64.whl (42.6 MB) - Using cached opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (63.0 MB) - Using cached tqdm-4.67.1-py3-none-any.whl (78 kB) - Using cached ipydatawidgets-4.3.5-py2.py3-none-any.whl (271 kB) - Using cached numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB) - Using cached openvino_telemetry-2025.0.0-py3-none-any.whl (24 kB) - Using cached filelock-3.16.1-py3-none-any.whl (16 kB) - Downloading fsspec-2025.2.0-py3-none-any.whl (184 kB) - Using cached networkx-3.1-py3-none-any.whl (2.1 MB) - Using cached sympy-1.13.3-py3-none-any.whl (6.2 MB) - Using cached traittypes-0.2.1-py2.py3-none-any.whl (8.6 kB) - Installing collected packages: openvino-telemetry, mpmath, traittypes, tqdm, sympy, numpy, networkx, fsspec, filelock, torch, openvino, opencv-python, ipydatawidgets, pythreejs - Successfully installed filelock-3.16.1 fsspec-2025.2.0 ipydatawidgets-4.3.5 mpmath-1.3.0 networkx-3.1 numpy-1.24.4 opencv-python-4.11.0.86 openvino-2024.4.0 
openvino-telemetry-2025.0.0 pythreejs-2.4.2 sympy-1.13.3 torch-2.4.1+cpu tqdm-4.67.1 traittypes-0.2.1 - Note: you may need to restart the kernel to use updated packages. - - -Imports -------- - - - -.. code:: ipython3 - - import collections - import time - from pathlib import Path - - import cv2 - import ipywidgets as widgets - import numpy as np - from IPython.display import clear_output, display - import openvino as ov - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - with open("notebook_utils.py", "w") as f: - f.write(r.text) - - if not Path("engine3js.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/engine3js.py", - ) - with open("engine3js.py", "w") as f: - f.write(r.text) - - import notebook_utils as utils - import engine3js as engine - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("3D-pose-estimation.ipynb") - -The model ---------- - - - -Download the model -~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import download_file - import tarfile - - - # directory where model will be downloaded - base_model_dir = Path("model") - - if not base_model_dir.exists(): - download_file( - "https://storage.openvinotoolkit.org/repositories/open_model_zoo/public/2022.1/human-pose-estimation-3d-0001/human-pose-estimation-3d.tar.gz", - directory=base_model_dir, - ) - - ckpt_file = base_model_dir / "human-pose-estimation-3d-0001.pth" - - if not ckpt_file.exists(): - with tarfile.open(base_model_dir / "human-pose-estimation-3d.tar.gz") as f: - f.extractall(base_model_dir) - - - -.. parsed-literal:: - - human-pose-estimation-3d.tar.gz: 0%| | 0.00/17.6M [00:00`__ -from Open Model Zoo. - -.. code:: ipython3 - - # 3D edge index array - body_edges = np.array( - [ - [0, 1], - [0, 9], - [9, 10], - [10, 11], # neck - r_shoulder - r_elbow - r_wrist - [0, 3], - [3, 4], - [4, 5], # neck - l_shoulder - l_elbow - l_wrist - [1, 15], - [15, 16], # nose - l_eye - l_ear - [1, 17], - [17, 18], # nose - r_eye - r_ear - [0, 6], - [6, 7], - [7, 8], # neck - l_hip - l_knee - l_ankle - [0, 12], - [12, 13], - [13, 14], # neck - r_hip - r_knee - r_ankle - ] - ) - - - body_edges_2d = np.array( - [ - [0, 1], # neck - nose - [1, 16], - [16, 18], # nose - l_eye - l_ear - [1, 15], - [15, 17], # nose - r_eye - r_ear - [0, 3], - [3, 4], - [4, 5], # neck - l_shoulder - l_elbow - l_wrist - [0, 9], - [9, 10], - [10, 11], # neck - r_shoulder - r_elbow - r_wrist - [0, 6], - [6, 7], - [7, 8], # neck - l_hip - l_knee - l_ankle - [0, 12], - [12, 13], - [13, 14], # neck - r_hip - r_knee - r_ankle - ] - ) - - - def draw_poses(frame, poses_2d, scaled_img, use_popup): - """ - Draw 2D pose overlays on the image to visualize estimated poses. - Joints are drawn as circles and limbs are drawn as lines. - - :param frame: the input image - :param poses_2d: array of human joint pairs - """ - for pose in poses_2d: - pose = np.array(pose[0:-1]).reshape((-1, 3)).transpose() - was_found = pose[2] > 0 - - pose[0], pose[1] = ( - pose[0] * frame.shape[1] / scaled_img.shape[1], - pose[1] * frame.shape[0] / scaled_img.shape[0], - ) - - # Draw joints. 
- for edge in body_edges_2d: - if was_found[edge[0]] and was_found[edge[1]]: - cv2.line( - frame, - tuple(pose[0:2, edge[0]].astype(np.int32)), - tuple(pose[0:2, edge[1]].astype(np.int32)), - (255, 255, 0), - 4, - cv2.LINE_AA, - ) - # Draw limbs. - for kpt_id in range(pose.shape[1]): - if pose[2, kpt_id] != -1: - cv2.circle( - frame, - tuple(pose[0:2, kpt_id].astype(np.int32)), - 3, - (0, 255, 255), - -1, - cv2.LINE_AA, - ) - - return frame - -Main Processing Function -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Run 3D pose estimation on the specified source. It could be either a -webcam feed or a video file. - -.. code:: ipython3 - - def run_pose_estimation(source=0, flip=False, use_popup=False, skip_frames=0): - """ - 2D image as input, using OpenVINO as inference backend, - get joints 3D coordinates, and draw 3D human skeleton in the scene - - :param source: The webcam number to feed the video stream with primary webcam set to "0", or the video path. - :param flip: To be used by VideoPlayer function for flipping capture image. - :param use_popup: False for showing encoded frames over this notebook, True for creating a popup window. - :param skip_frames: Number of frames to skip at the beginning of the video. - """ - - focal_length = -1 # default - stride = 8 - player = None - skeleton_set = None - - try: - # create video player to play with target fps video_path - # get the frame from camera - # You can skip first N frames to fast forward video. change 'skip_first_frames' - player = utils.VideoPlayer(source, flip=flip, fps=30, skip_first_frames=skip_frames) - # start capturing - player.start() - - input_image = player.next() - # set the window size - resize_scale = 450 / input_image.shape[1] - windows_width = int(input_image.shape[1] * resize_scale) - windows_height = int(input_image.shape[0] * resize_scale) - - # use visualization library - engine3D = engine.Engine3js(grid=True, axis=True, view_width=windows_width, view_height=windows_height) - - if use_popup: - # display the 3D human pose in this notebook, and origin frame in popup window - display(engine3D.renderer) - title = "Press ESC to Exit" - cv2.namedWindow(title, cv2.WINDOW_KEEPRATIO | cv2.WINDOW_AUTOSIZE) - else: - # set the 2D image box, show both human pose and image in the notebook - imgbox = widgets.Image(format="jpg", height=windows_height, width=windows_width) - display(widgets.HBox([engine3D.renderer, imgbox])) - - skeleton = engine.Skeleton(body_edges=body_edges) - - processing_times = collections.deque() - - while True: - # grab the frame - frame = player.next() - if frame is None: - print("Source ended") - break - - # resize image and change dims to fit neural network input - # (see https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/human-pose-estimation-3d-0001) - scaled_img = cv2.resize(frame, dsize=(model.inputs[0].shape[3], model.inputs[0].shape[2])) - - if focal_length < 0: # Focal length is unknown - focal_length = np.float32(0.8 * scaled_img.shape[1]) - - # inference start - start_time = time.time() - # get results - inference_result = model_infer(scaled_img, stride) - - # inference stop - stop_time = time.time() - processing_times.append(stop_time - start_time) - # Process the point to point coordinates of the data - poses_3d, poses_2d = engine.parse_poses(inference_result, 1, stride, focal_length, True) - - # use processing times from last 200 frames - if len(processing_times) > 200: - processing_times.popleft() - - processing_time = np.mean(processing_times) * 1000 - fps = 1000 / 
processing_time - - if len(poses_3d) > 0: - # From here, you can rotate the 3D point positions using the function "draw_poses", - # or you can directly make the correct mapping below to properly display the object image on the screen - poses_3d_copy = poses_3d.copy() - x = poses_3d_copy[:, 0::4] - y = poses_3d_copy[:, 1::4] - z = poses_3d_copy[:, 2::4] - poses_3d[:, 0::4], poses_3d[:, 1::4], poses_3d[:, 2::4] = ( - -z + np.ones(poses_3d[:, 2::4].shape) * 200, - -y + np.ones(poses_3d[:, 2::4].shape) * 100, - -x, - ) - - poses_3d = poses_3d.reshape(poses_3d.shape[0], 19, -1)[:, :, 0:3] - people = skeleton(poses_3d=poses_3d) - - try: - engine3D.scene_remove(skeleton_set) - except Exception: - pass - - engine3D.scene_add(people) - skeleton_set = people - - # draw 2D - frame = draw_poses(frame, poses_2d, scaled_img, use_popup) - - else: - try: - engine3D.scene_remove(skeleton_set) - skeleton_set = None - except Exception: - pass - - cv2.putText( - frame, - f"Inference time: {processing_time:.1f}ms ({fps:.1f} FPS)", - (10, 30), - cv2.FONT_HERSHEY_COMPLEX, - 0.7, - (0, 0, 255), - 1, - cv2.LINE_AA, - ) - - if use_popup: - cv2.imshow(title, frame) - key = cv2.waitKey(1) - # escape = 27, use ESC to exit - if key == 27: - break - else: - # encode numpy array to jpg - imgbox.value = cv2.imencode( - ".jpg", - frame, - params=[cv2.IMWRITE_JPEG_QUALITY, 90], - )[1].tobytes() - - engine3D.renderer.render(engine3D.scene, engine3D.cam) - - except KeyboardInterrupt: - print("Interrupted") - except RuntimeError as e: - print(e) - finally: - clear_output() - if player is not None: - # stop capturing - player.stop() - if use_popup: - cv2.destroyAllWindows() - if skeleton_set: - engine3D.scene_remove(skeleton_set) - -Run ---- - - - -Run, using a webcam as the video input. By default, the primary webcam -is set with ``source=0``. If you have multiple webcams, each one will be -assigned a consecutive number starting at 0. Set ``flip=True`` when -using a front-facing camera. Some web browsers, especially Mozilla -Firefox, may cause flickering. If you experience flickering, set -``use_popup=True``. - - **NOTE**: - - *1. To use this notebook with a webcam, you need to run the notebook - on a computer with a webcam. If you run the notebook on a server - (e.g. Binder), the webcam will not work.* - - *2. Popup mode may not work if you run this notebook on a remote - computer (e.g. Binder).* - -If you do not have a webcam, you can still run this demo with a video -file. Any `format supported by -OpenCV `__ -will work. - -Using the following method, you can click and move your mouse over the -picture on the left to interact. - -.. 
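For example, assuming a webcam is available on the machine running the notebook, the demo can be started directly from the primary camera in a popup window with a call like the following (``source=0`` selects the first camera):

.. code:: ipython3

    # Hypothetical invocation: requires a local webcam and a desktop session.
    run_pose_estimation(source=0, flip=True, use_popup=True)

..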
code:: ipython3 - - from notebook_utils import download_file - - USE_WEBCAM = False - - cam_id = 0 - if not Path("face-demographics-walking.mp4").exists(): - download_file( - "https://storage.openvinotoolkit.org/data/test_data/videos/face-demographics-walking.mp4", - ) - video_path = "face-demographics-walking.mp4" - - source = cam_id if USE_WEBCAM else video_path - - run_pose_estimation(source=source, flip=isinstance(source, int), use_popup=False) diff --git a/docs/notebooks/3D-segmentation-point-clouds-with-output.rst b/docs/notebooks/3D-segmentation-point-clouds-with-output.rst deleted file mode 100644 index 8f247591e6647c..00000000000000 --- a/docs/notebooks/3D-segmentation-point-clouds-with-output.rst +++ /dev/null @@ -1,333 +0,0 @@ -Part Segmentation of 3D Point Clouds with OpenVINO™ -=================================================== - -This notebook demonstrates how to process `point -cloud `__ data and run 3D -Part Segmentation with OpenVINO. We use the -`PointNet `__ pre-trained model to -detect each part of a chair and return its category. - -PointNet --------- - -PointNet was proposed by Charles Ruizhongtai Qi, a researcher at -Stanford University in 2016: `PointNet: Deep Learning on Point Sets for -3D Classification and -Segmentation `__. The motivation -behind the research is to classify and segment 3D representations of -images. They use a data structure called point cloud, which is a set of -points that represents a 3D shape or object. PointNet provides a unified -architecture for applications ranging from object classification, part -segmentation, to scene semantic parsing. It is highly efficient and -effective, showing strong performance on par or even better than state -of the art. - - -**Table of contents:** - - -- `Imports <#imports>`__ -- `Prepare the Model <#prepare-the-model>`__ -- `Data Processing Module <#data-processing-module>`__ -- `Visualize the original 3D data <#visualize-the-original-3d-data>`__ -- `Run inference <#run-inference>`__ - - - `Select inference device <#select-inference-device>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - %pip install -q "openvino>=2023.1.0" "tqdm" "matplotlib>=3.4" - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -Imports -------- - - - -.. code:: ipython3 - - from pathlib import Path - from typing import Union - import numpy as np - import matplotlib.pyplot as plt - import openvino as ov - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("3D-segmentation-point-clouds.ipynb") - -Prepare the Model ------------------ - - - -Download the pre-trained PointNet ONNX model. This pre-trained model is -provided by `axinc-ai `__, and you can -find more point clouds examples -`here `__. - -.. 
code:: ipython3 - - # Set the data and model directories, model source URL and model filename - MODEL_DIR = Path("model") - MODEL_DIR.mkdir(exist_ok=True) - onnx_model_path = MODEL_DIR / "chair_100.onnx" - - if not onnx_model_path.exists(): - download_file( - "https://storage.googleapis.com/ailia-models/pointnet_pytorch/chair_100.onnx", - directory=Path(MODEL_DIR), - show_progress=False, - ) - -Convert the ONNX model to OpenVINO IR. An OpenVINO IR (Intermediate -Representation) model consists of an ``.xml`` file, containing -information about network topology, and a ``.bin`` file, containing the -weights and biases binary data. Model conversion Python API is used for -conversion of ONNX model to OpenVINO IR. The ``ov.convert_model`` Python -function returns an OpenVINO model ready to load on a device and start -making predictions. We can save it on a disk for next usage with -``ov.save_model``. For more information about model conversion Python -API, see this -`page `__. - -.. code:: ipython3 - - ir_model_xml = onnx_model_path.with_suffix(".xml") - - core = ov.Core() - - if not ir_model_xml.exists(): - # Convert model to OpenVINO Model - model = ov.convert_model(onnx_model_path) - # Serialize model in OpenVINO IR format xml + bin - ov.save_model(model, ir_model_xml) - else: - # Read model - model = core.read_model(model=ir_model_xml) - -Data Processing Module ----------------------- - - - -.. code:: ipython3 - - def load_data(point_file: Union[str, Path]): - """ - Load the point cloud data and convert it to ndarray - - Parameters: - point_file: string, path of .pts data - Returns: - point_set: point clound represented in np.array format - """ - - point_set = np.loadtxt(point_file).astype(np.float32) - - # normailization - point_set = point_set - np.expand_dims(np.mean(point_set, axis=0), 0) # center - dist = np.max(np.sqrt(np.sum(point_set**2, axis=1)), 0) - point_set = point_set / dist # scale - - return point_set - - - def visualize(point_set: np.ndarray): - """ - Create a 3D view for data visualization - - Parameters: - point_set: np.ndarray, the coordinate data in X Y Z format - """ - - fig = plt.figure(dpi=192, figsize=(4, 4)) - ax = fig.add_subplot(111, projection="3d") - X = point_set[:, 0] - Y = point_set[:, 2] - Z = point_set[:, 1] - - # Scale the view of each axis to adapt to the coordinate data distribution - max_range = np.array([X.max() - X.min(), Y.max() - Y.min(), Z.max() - Z.min()]).max() * 0.5 - mid_x = (X.max() + X.min()) * 0.5 - mid_y = (Y.max() + Y.min()) * 0.5 - mid_z = (Z.max() + Z.min()) * 0.5 - ax.set_xlim(mid_x - max_range, mid_x + max_range) - ax.set_ylim(mid_y - max_range, mid_y + max_range) - ax.set_zlim(mid_z - max_range, mid_z + max_range) - - plt.tick_params(labelsize=5) - ax.set_xlabel("X", fontsize=10) - ax.set_ylabel("Y", fontsize=10) - ax.set_zlabel("Z", fontsize=10) - - return ax - -Visualize the original 3D data ------------------------------- - - - -The point cloud data can be downloaded from -`ShapeNet `__, -a large-scale dataset of 3D shapes. Here, we select the 3D data of a -chair for example. - -.. 
code:: ipython3 - - # Download data from the openvino_notebooks storage - point_data_path = Path("data") / "chair.pts" - - if not point_data_path.exists(): - point_data = download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/pts/chair.pts", - directory="data", - ) - - points = load_data(str(point_data)) - X = points[:, 0] - Y = points[:, 2] - Z = points[:, 1] - ax = visualize(points) - ax.scatter3D(X, Y, Z, s=5, cmap="jet", marker="o", label="chair") - ax.set_title("3D Visualization") - plt.legend(loc="upper right", fontsize=8) - plt.show() - - - -.. parsed-literal:: - - chair.pts: 0%| | 0.00/69.2k [00:00`__ from `Open -Model Zoo `__, -specifically an -`Encoder `__ -and a -`Decoder `__. -Both models create a sequence to sequence (``"seq2seq"``) [1] system to -identify the human activities for `Kinetics-400 -dataset `__. The -models use the Video Transformer approach with ResNet34 encoder [2]. The -notebook shows how to create the following pipeline: - -Final part of this notebook shows live inference results from a webcam. -Additionally, you can also upload a video file. - -**NOTE**: To use a webcam, you must run this Jupyter notebook on a -computer with a webcam. If you run on a server, the webcam will not -work. However, you can still do inference on a video in the final step. - --------------- - -[1] seq2seq: Deep learning models that take a sequence of items to the -input and output. In this case, input: video frames, output: actions -sequence. This ``"seq2seq"`` is composed of an encoder and a decoder. -The encoder captures ``"context"`` of the inputs to be analyzed by the -decoder, and finally gets the human action and confidence. - -[2] `Video -Transformer `__ -and -`ResNet34 `__. - - -**Table of contents:** - - -- `Imports <#imports>`__ -- `The models <#the-models>`__ - - - `Download the models <#download-the-models>`__ - - `Load your labels <#load-your-labels>`__ - - `Load the models <#load-the-models>`__ - - - `Model Initialization - function <#model-initialization-function>`__ - - `Initialization for Encoder and - Decoder <#initialization-for-encoder-and-decoder>`__ - - - `Helper functions <#helper-functions>`__ - - `AI Functions <#ai-functions>`__ - - `Main Processing Function <#main-processing-function>`__ - - `Run Action Recognition <#run-action-recognition>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - %pip install -q "openvino>=2024.0.0" "opencv-python" "tqdm" - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -Imports -------- - - - -.. 
code:: ipython3 - - import collections - import os - import time - from typing import Tuple, List - - from pathlib import Path - - import cv2 - import numpy as np - from IPython import display - import openvino as ov - from openvino.runtime.ie_api import CompiledModel - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - import notebook_utils as utils - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("action-recognition-webcam.ipynb") - -The models ----------- - - - -Download the models -~~~~~~~~~~~~~~~~~~~ - - - -Use the ``download_ir_model``, a function from the ``notebook_utils`` -file. It automatically creates a directory structure and downloads the -selected model. - -In this case you can use ``"action-recognition-0001"`` as a model name, -and the system automatically downloads the two models -``"action-recognition-0001-encoder"`` and -``"action-recognition-0001-decoder"`` - - **NOTE**: If you want to download another model, such as - ``"driver-action-recognition-adas-0002"`` - (``"driver-action-recognition-adas-0002-encoder"`` + - ``"driver-action-recognition-adas-0002-decoder"``), replace the name - of the model in the code below. Using a model outside the list can - require different pre- and post-processing. - -.. code:: ipython3 - - # A directory where the model will be downloaded. - base_model_dir = "model" - # The name of the model from Open Model Zoo. - model_name = "action-recognition-0001" - # Selected precision (FP32, FP16, FP16-INT8). - precision = "FP16" - model_path_decoder = f"model/intel/{model_name}/{model_name}-decoder/{precision}/{model_name}-decoder.xml" - model_path_encoder = f"model/intel/{model_name}/{model_name}-encoder/{precision}/{model_name}-encoder.xml" - encoder_url = f"https://storage.openvinotoolkit.org/repositories/open_model_zoo/temp/{model_name}/{model_name}-encoder/{precision}/{model_name}-encoder.xml" - decoder_url = f"https://storage.openvinotoolkit.org/repositories/open_model_zoo/temp/{model_name}/{model_name}-decoder/{precision}/{model_name}-decoder.xml" - - if not os.path.exists(model_path_decoder): - utils.download_ir_model(decoder_url, Path(model_path_decoder).parent) - if not os.path.exists(model_path_encoder): - utils.download_ir_model(encoder_url, Path(model_path_encoder).parent) - - - -.. parsed-literal:: - - action-recognition-0001-decoder.bin: 0%| | 0.00/14.4M [00:00`__, and -also provides the text file embedded into this notebook. - - **NOTE**: If you want to run - ``"driver-action-recognition-adas-0002"`` model, replace the - ``kinetics.txt`` file to ``driver_actions.txt``. - -.. code:: ipython3 - - # Download the text from the openvino_notebooks storage - - if not (Path("data") / "kinetics.txt").exists(): - vocab_file_path = utils.download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/text/kinetics.txt", - directory="data", - ) - - with vocab_file_path.open(mode="r") as f: - labels = [line.strip() for line in f] - - print(labels[0:9], np.shape(labels)) - - - -.. 
parsed-literal:: - - kinetics.txt: 0%| | 0.00/5.82k [00:00 Tuple: - """ - Read the network and weights from a file, load the - model on CPU and get input and output names of nodes - - :param: - model: model architecture path *.xml - device: inference device - :retuns: - compiled_model: Compiled model - input_key: Input node for model - output_key: Output node for model - """ - - # Read the network and corresponding weights from a file. - model = core.read_model(model=model_path) - # Compile the model for specified device. - compiled_model = core.compile_model(model=model, device_name=device) - # Get input and output names of nodes. - input_keys = compiled_model.input(0) - output_keys = compiled_model.output(0) - return input_keys, output_keys, compiled_model - -Initialization for Encoder and Decoder -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -.. code:: ipython3 - - # Encoder initialization - input_key_en, output_keys_en, compiled_model_en = model_init(model_path_encoder, device.value) - # Decoder initialization - input_key_de, output_keys_de, compiled_model_de = model_init(model_path_decoder, device.value) - - # Get input size - Encoder. - height_en, width_en = list(input_key_en.shape)[2:] - # Get input size - Decoder. - frames2decode = list(input_key_de.shape)[0:][1] - -Helper functions -~~~~~~~~~~~~~~~~ - - - -Use the following helper functions for preprocessing and postprocessing -frames: - -1. Preprocess the input image before running the Encoder model. - (``center_crop`` and ``adaptative_resize``) -2. Decode top-3 probabilities into label names. (``decode_output``) -3. Draw the Region of Interest (ROI) over the video. - (``rec_frame_display``) -4. Prepare the frame for displaying label names over the video. - (``display_text_fnc``) - -.. code:: ipython3 - - def center_crop(frame: np.ndarray) -> np.ndarray: - """ - Center crop squared the original frame to standardize the input image to the encoder model - - :param frame: input frame - :returns: center-crop-squared frame - """ - img_h, img_w, _ = frame.shape - min_dim = min(img_h, img_w) - start_x = int((img_w - min_dim) / 2.0) - start_y = int((img_h - min_dim) / 2.0) - roi = [start_y, (start_y + min_dim), start_x, (start_x + min_dim)] - return frame[start_y : (start_y + min_dim), start_x : (start_x + min_dim), ...], roi - - - def adaptive_resize(frame: np.ndarray, size: int) -> np.ndarray: - """ - The frame going to be resized to have a height of size or a width of size - - :param frame: input frame - :param size: input size to encoder model - :returns: resized frame, np.array type - """ - h, w, _ = frame.shape - scale = size / min(h, w) - w_scaled, h_scaled = int(w * scale), int(h * scale) - if w_scaled == w and h_scaled == h: - return frame - return cv2.resize(frame, (w_scaled, h_scaled)) - - - def decode_output(probs: np.ndarray, labels: np.ndarray, top_k: int = 3) -> np.ndarray: - """ - Decodes top probabilities into corresponding label names - - :param probs: confidence vector for 400 actions - :param labels: list of actions - :param top_k: The k most probable positions in the list of labels - :returns: decoded_labels: The k most probable actions from the labels list - decoded_top_probs: confidence for the k most probable actions - """ - top_ind = np.argsort(-1 * probs)[:top_k] - out_label = np.array(labels)[top_ind.astype(int)] - decoded_labels = [out_label[0][0], out_label[0][1], out_label[0][2]] - top_probs = np.array(probs)[0][top_ind.astype(int)] - decoded_top_probs = [top_probs[0][0], top_probs[0][1], top_probs[0][2]] - 
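-    # decoded_labels and decoded_top_probs are parallel lists: entry i holds
-    # the i-th most probable action name and its confidence score.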
return decoded_labels, decoded_top_probs - - - def rec_frame_display(frame: np.ndarray, roi) -> np.ndarray: - """ - Draw a rec frame over actual frame - - :param frame: input frame - :param roi: Region of interest, image section processed by the Encoder - :returns: frame with drawed shape - - """ - - cv2.line(frame, (roi[2] + 3, roi[0] + 3), (roi[2] + 3, roi[0] + 100), (0, 200, 0), 2) - cv2.line(frame, (roi[2] + 3, roi[0] + 3), (roi[2] + 100, roi[0] + 3), (0, 200, 0), 2) - cv2.line(frame, (roi[3] - 3, roi[1] - 3), (roi[3] - 3, roi[1] - 100), (0, 200, 0), 2) - cv2.line(frame, (roi[3] - 3, roi[1] - 3), (roi[3] - 100, roi[1] - 3), (0, 200, 0), 2) - cv2.line(frame, (roi[3] - 3, roi[0] + 3), (roi[3] - 3, roi[0] + 100), (0, 200, 0), 2) - cv2.line(frame, (roi[3] - 3, roi[0] + 3), (roi[3] - 100, roi[0] + 3), (0, 200, 0), 2) - cv2.line(frame, (roi[2] + 3, roi[1] - 3), (roi[2] + 3, roi[1] - 100), (0, 200, 0), 2) - cv2.line(frame, (roi[2] + 3, roi[1] - 3), (roi[2] + 100, roi[1] - 3), (0, 200, 0), 2) - # Write ROI over actual frame - FONT_STYLE = cv2.FONT_HERSHEY_SIMPLEX - org = (roi[2] + 3, roi[1] - 3) - org2 = (roi[2] + 2, roi[1] - 2) - FONT_SIZE = 0.5 - FONT_COLOR = (0, 200, 0) - FONT_COLOR2 = (0, 0, 0) - cv2.putText(frame, "ROI", org2, FONT_STYLE, FONT_SIZE, FONT_COLOR2) - cv2.putText(frame, "ROI", org, FONT_STYLE, FONT_SIZE, FONT_COLOR) - return frame - - - def display_text_fnc(frame: np.ndarray, display_text: str, index: int): - """ - Include a text on the analyzed frame - - :param frame: input frame - :param display_text: text to add on the frame - :param index: index line dor adding text - - """ - # Configuration for displaying images with text. - FONT_COLOR = (255, 255, 255) - FONT_COLOR2 = (0, 0, 0) - FONT_STYLE = cv2.FONT_HERSHEY_DUPLEX - FONT_SIZE = 0.7 - TEXT_VERTICAL_INTERVAL = 25 - TEXT_LEFT_MARGIN = 15 - # ROI over actual frame - (processed, roi) = center_crop(frame) - # Draw a ROI over actual frame. - frame = rec_frame_display(frame, roi) - # Put a text over actual frame. - text_loc = (TEXT_LEFT_MARGIN, TEXT_VERTICAL_INTERVAL * (index + 1)) - text_loc2 = (TEXT_LEFT_MARGIN + 1, TEXT_VERTICAL_INTERVAL * (index + 1) + 1) - cv2.putText(frame, display_text, text_loc2, FONT_STYLE, FONT_SIZE, FONT_COLOR2) - cv2.putText(frame, display_text, text_loc, FONT_STYLE, FONT_SIZE, FONT_COLOR) - -AI Functions -~~~~~~~~~~~~ - - - -Following the pipeline above, you will use the next functions to: - -1. Preprocess a frame before running the Encoder. (``preprocessing``) -2. Encoder Inference per frame. (``encoder``) -3. Decoder inference per set of frames. (``decoder``) -4. Normalize the Decoder output to get confidence values per action - recognition label. (``softmax``) - -.. code:: ipython3 - - def preprocessing(frame: np.ndarray, size: int) -> np.ndarray: - """ - Preparing frame before Encoder. - The image should be scaled to its shortest dimension at "size" - and cropped, centered, and squared so that both width and - height have lengths "size". The frame must be transposed from - Height-Width-Channels (HWC) to Channels-Height-Width (CHW). 
- - :param frame: input frame - :param size: input size to encoder model - :returns: resized and cropped frame - """ - # Adaptative resize - preprocessed = adaptive_resize(frame, size) - # Center_crop - (preprocessed, roi) = center_crop(preprocessed) - # Transpose frame HWC -> CHW - preprocessed = preprocessed.transpose((2, 0, 1))[None,] # HWC -> CHW - return preprocessed, roi - - - def encoder(preprocessed: np.ndarray, compiled_model: CompiledModel) -> List: - """ - Encoder Inference per frame. This function calls the network previously - configured for the encoder model (compiled_model), extracts the data - from the output node, and appends it in an array to be used by the decoder. - - :param: preprocessed: preprocessing frame - :param: compiled_model: Encoder model network - :returns: encoder_output: embedding layer that is appended with each arriving frame - """ - output_key_en = compiled_model.output(0) - - # Get results on action-recognition-0001-encoder model - infer_result_encoder = compiled_model([preprocessed])[output_key_en] - return infer_result_encoder - - - def decoder(encoder_output: List, compiled_model_de: CompiledModel) -> List: - """ - Decoder inference per set of frames. This function concatenates the embedding layer - froms the encoder output, transpose the array to match with the decoder input size. - Calls the network previously configured for the decoder model (compiled_model_de), extracts - the logits and normalize those to get confidence values along specified axis. - Decodes top probabilities into corresponding label names - - :param: encoder_output: embedding layer for 16 frames - :param: compiled_model_de: Decoder model network - :returns: decoded_labels: The k most probable actions from the labels list - decoded_top_probs: confidence for the k most probable actions - """ - # Concatenate sample_duration frames in just one array - decoder_input = np.concatenate(encoder_output, axis=0) - # Organize input shape vector to the Decoder (shape: [1x16x512]] - decoder_input = decoder_input.transpose((2, 0, 1, 3)) - decoder_input = np.squeeze(decoder_input, axis=3) - output_key_de = compiled_model_de.output(0) - # Get results on action-recognition-0001-decoder model - result_de = compiled_model_de([decoder_input])[output_key_de] - # Normalize logits to get confidence values along specified axis - probs = softmax(result_de - np.max(result_de)) - # Decodes top probabilities into corresponding label names - decoded_labels, decoded_top_probs = decode_output(probs, labels, top_k=3) - return decoded_labels, decoded_top_probs - - - def softmax(x: np.ndarray) -> np.ndarray: - """ - Normalizes logits to get confidence values along specified axis - x: np.array, axis=None - """ - exp = np.exp(x) - return exp / np.sum(exp, axis=None) - -Main Processing Function -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Running action recognition function will run in different operations, -either a webcam or a video file. See the list of procedures below: - -1. Create a video player to play with target fps - (``utils.VideoPlayer``). -2. Prepare a set of frames to be encoded-decoded. -3. Run AI functions -4. Visualize the results. - -.. code:: ipython3 - - def run_action_recognition( - source: str = "0", - flip: bool = True, - use_popup: bool = False, - compiled_model_en: CompiledModel = compiled_model_en, - compiled_model_de: CompiledModel = compiled_model_de, - skip_first_frames: int = 0, - ): - """ - Use the "source" webcam or video file to run the complete pipeline for action-recognition problem - 1. 
Create a video player to play with target fps - 2. Prepare a set of frames to be encoded-decoded - 3. Preprocess frame before Encoder - 4. Encoder Inference per frame - 5. Decoder inference per set of frames - 6. Visualize the results - - :param: source: webcam "0" or video path - :param: flip: to be used by VideoPlayer function for flipping capture image - :param: use_popup: False for showing encoded frames over this notebook, True for creating a popup window. - :param: skip_first_frames: Number of frames to skip at the beginning of the video. - :returns: display video over the notebook or in a popup window - - """ - size = height_en # Endoder input size - From Cell 5_9 - sample_duration = frames2decode # Decoder input size - From Cell 5_7 - # Select frames per second of your source. - fps = 30 - player = None - try: - # Create a video player. - player = utils.VideoPlayer(source, flip=flip, fps=fps, skip_first_frames=skip_first_frames) - # Start capturing. - player.start() - if use_popup: - title = "Press ESC to Exit" - cv2.namedWindow(title, cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE) - - processing_times = collections.deque() - processing_time = 0 - encoder_output = [] - decoded_labels = [0, 0, 0] - decoded_top_probs = [0, 0, 0] - counter = 0 - # Create a text template to show inference results over video. - text_inference_template = "Infer Time:{Time:.1f}ms,{fps:.1f}FPS" - text_template = "{label},{conf:.2f}%" - - while True: - counter = counter + 1 - - # Read a frame from the video stream. - frame = player.next() - if frame is None: - print("Source ended") - break - - scale = 1280 / max(frame.shape) - - # Adaptative resize for visualization. - if scale < 1: - frame = cv2.resize(frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA) - - # Select one frame every two for processing through the encoder. - # After 16 frames are processed, the decoder will find the action, - # and the label will be printed over the frames. - - if counter % 2 == 0: - # Preprocess frame before Encoder. - (preprocessed, _) = preprocessing(frame, size) - - # Measure processing time. - start_time = time.time() - - # Encoder Inference per frame - encoder_output.append(encoder(preprocessed, compiled_model_en)) - - # Decoder inference per set of frames - # Wait for sample duration to work with decoder model. - if len(encoder_output) == sample_duration: - decoded_labels, decoded_top_probs = decoder(encoder_output, compiled_model_de) - encoder_output = [] - - # Inference has finished. Display the results. - stop_time = time.time() - - # Calculate processing time. - processing_times.append(stop_time - start_time) - - # Use processing times from last 200 frames. - if len(processing_times) > 200: - processing_times.popleft() - - # Mean processing time [ms] - processing_time = np.mean(processing_times) * 1000 - fps = 1000 / processing_time - - # Visualize the results. - for i in range(0, 3): - display_text = text_template.format( - label=decoded_labels[i], - conf=decoded_top_probs[i] * 100, - ) - display_text_fnc(frame, display_text, i) - - display_text = text_inference_template.format(Time=processing_time, fps=fps) - display_text_fnc(frame, display_text, 3) - - # Use this workaround if you experience flickering. - if use_popup: - cv2.imshow(title, frame) - key = cv2.waitKey(1) - # escape = 27 - if key == 27: - break - else: - # Encode numpy array to jpg. - _, encoded_img = cv2.imencode(".jpg", frame, params=[cv2.IMWRITE_JPEG_QUALITY, 90]) - # Create an IPython image. 
- i = display.Image(data=encoded_img) - # Display the image in this notebook. - display.clear_output(wait=True) - display.display(i) - - # ctrl-c - except KeyboardInterrupt: - print("Interrupted") - # Any different error - except RuntimeError as e: - print(e) - finally: - if player is not None: - # Stop capturing. - player.stop() - if use_popup: - cv2.destroyAllWindows() - -Run Action Recognition -~~~~~~~~~~~~~~~~~~~~~~ - - - -Find out how the model works in a video file. `Any format -supported `__ -by OpenCV will work. You can press the stop button anytime while the -video file is running, and it will activate the webcam for the next -step. - - **NOTE**: Sometimes, the video can be cut off if there are corrupted - frames. In that case, you can convert it. If you experience any - problems with your video, use the - `HandBrake `__ and select the MPEG format. - -if you want to use a web camera as an input source for the demo, please -change the value of ``USE_WEBCAM`` variable to True and specify -``cam_id`` (the default value is 0, which can be different in -multi-camera systems). - -.. code:: ipython3 - - USE_WEBCAM = False - - cam_id = 0 - video_url = "https://archive.org/serve/ISSVideoResourceLifeOnStation720p/ISS%20Video%20Resource_LifeOnStation_720p.mp4" - video_file = Path("ISS%20Video%20Resource_LifeOnStation_720p.mp4") - - if not USE_WEBCAM and video_file.exists(): - utils.download_file(video_url, filename=video_file.name) - - source = cam_id if USE_WEBCAM else video_file - additional_options = {"skip_first_frames": 600, "flip": False} if not USE_WEBCAM else {"flip": True} - run_action_recognition(source=source, use_popup=False, **additional_options) - - -.. parsed-literal:: - - Cannot open ISS%20Video%20Resource_LifeOnStation_720p.mp4 - - -.. 
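Note that the guard in the cell above calls ``utils.download_file`` only when ``video_file`` already exists, so on a clean environment the video is never fetched and the player reports that it cannot open the file, as shown in the output above. A corrected guard, keeping the same helper, would be:

.. code:: ipython3

    # Suggested fix: download the video only when it is not already present.
    if not USE_WEBCAM and not video_file.exists():
        utils.download_file(video_url, filename=video_file.name)

..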
parsed-literal:: - - [ WARN:0@4.819] global cap.cpp:175 open VIDEOIO(CV_IMAGES): raised OpenCV exception: - - OpenCV(4.11.0) /io/opencv/modules/videoio/src/cap_images.cpp:237: error: (-5:Bad argument) CAP_IMAGES: error, expected '0?[1-9][du]' pattern, got: ISS%20Video%20Resource_LifeOnStation_720p.mp4 in function 'icvExtractPattern' - - - diff --git a/docs/notebooks/action-recognition-webcam-with-output_files/action-recognition-webcam-with-output_22_0.png b/docs/notebooks/action-recognition-webcam-with-output_files/action-recognition-webcam-with-output_22_0.png deleted file mode 100644 index ab9fc2a5beaeb8..00000000000000 --- a/docs/notebooks/action-recognition-webcam-with-output_files/action-recognition-webcam-with-output_22_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f97e26d51ee25eff20355e8151fa22506f49f7eb4c168e1db6a8214b6d4c75e2 -size 70131 diff --git a/docs/notebooks/all_notebooks_paths.txt b/docs/notebooks/all_notebooks_paths.txt deleted file mode 100644 index a0e55085a58730..00000000000000 --- a/docs/notebooks/all_notebooks_paths.txt +++ /dev/null @@ -1,177 +0,0 @@ -notebooks/3D-pose-estimation-webcam/3D-pose-estimation.ipynb -notebooks/3D-segmentation-point-clouds/3D-segmentation-point-clouds.ipynb -notebooks/action-recognition-webcam/action-recognition-webcam.ipynb -notebooks/animate-anyone/animate-anyone.ipynb -notebooks/async-api/async-api.ipynb -notebooks/auto-device/auto-device.ipynb -notebooks/bark-text-to-audio/bark-text-to-audio.ipynb -notebooks/big-transfer-quantization/tensorflow-bit-image-classification-nncf-quantization.ipynb -notebooks/blip-visual-language-processing/blip-visual-language-processing.ipynb -notebooks/catvton/catvton.ipynb -notebooks/clip-language-saliency-map/clip-language-saliency-map.ipynb -notebooks/clip-zero-shot-image-classification/clip-zero-shot-classification.ipynb -notebooks/controlnet-stable-diffusion/controlnet-stable-diffusion.ipynb -notebooks/convert-to-openvino/convert-to-openvino.ipynb -notebooks/ct-segmentation-quantize/ct-segmentation-quantize-nncf.ipynb -notebooks/ddcolor-image-colorization/ddcolor-image-colorization.ipynb -notebooks/deepseek-r1/deepseek-r1.ipynb -notebooks/depth-anything/depth-anything.ipynb -notebooks/depth-anything/depth-anything-v2.ipynb -notebooks/detectron2-to-openvino/detectron2-to-openvino.ipynb -notebooks/distil-whisper-asr/distil-whisper-asr.ipynb -notebooks/dynamicrafter-animating-images/dynamicrafter-animating-images.ipynb -notebooks/efficient-sam/efficient-sam.ipynb -notebooks/encodec-audio-compression/encodec-audio-compression.ipynb -notebooks/explainable-ai-1-basic/explainable-ai-1-basic.ipynb -notebooks/explainable-ai-2-deep-dive/explainable-ai-2-deep-dive.ipynb -notebooks/explainable-ai-3-map-interpretation/explainable-ai-3-map-interpretation.ipynb -notebooks/fast-segment-anything/fast-segment-anything.ipynb -notebooks/film-slowmo/film-slowmo.ipynb -notebooks/florence2/florence2.ipynb -notebooks/flux.1-image-generation/flux.1-image-generation.ipynb -notebooks/freevc-voice-conversion/freevc-voice-conversion.ipynb -notebooks/glm-edge-v/glm-edge-v.ipynb -notebooks/gpu-device/gpu-device.ipynb -notebooks/grammar-correction/grammar-correction.ipynb -notebooks/grounded-segment-anything/grounded-segment-anything.ipynb -notebooks/handwritten-ocr/handwritten-ocr.ipynb -notebooks/hello-detection/hello-detection.ipynb -notebooks/hello-npu/hello-npu.ipynb -notebooks/hello-segmentation/hello-segmentation.ipynb -notebooks/hello-world/hello-world.ipynb 
-notebooks/hugging-face-hub/hugging-face-hub.ipynb -notebooks/hunyuan-dit-image-generation/hunyuan-dit-image-generation.ipynb -notebooks/image-classification-quantization/image-classification-quantization.ipynb -notebooks/image-to-image-genai/image-to-image-genai.ipynb -notebooks/inpainting-genai/inpainting-genai.ipynb -notebooks/instant-id/instant-id.ipynb -notebooks/instruct-pix2pix-image-editing/instruct-pix2pix-image-editing.ipynb -notebooks/internvl2/internvl2.ipynb -notebooks/janus-multimodal-generation/janus-multimodal-generation.ipynb -notebooks/jax-to-openvino/jax-classification-to-openvino.ipynb -notebooks/jina-clip/jina-clip.ipynb -notebooks/knowledge-graphs-conve/knowledge-graphs-conve.ipynb -notebooks/kosmos2-multimodal-large-language-model/kosmos2-multimodal-large-language-model.ipynb -notebooks/language-quantize-bert/language-quantize-bert.ipynb -notebooks/latent-consistency-models-image-generation/latent-consistency-models-image-generation.ipynb -notebooks/latent-consistency-models-image-generation/lcm-lora-controlnet.ipynb -notebooks/llava-multimodal-chatbot/llava-multimodal-chatbot-genai.ipynb -notebooks/llava-multimodal-chatbot/llava-multimodal-chatbot-optimum.ipynb -notebooks/llava-next-multimodal-chatbot/llava-next-multimodal-chatbot.ipynb -notebooks/llm-agent-functioncall/llm-agent-functioncall-qwen.ipynb -notebooks/llm-agent-react/llm-agent-rag-llamaindex.ipynb -notebooks/llm-agent-react/llm-agent-react.ipynb -notebooks/llm-agent-react/llm-agent-react-langchain.ipynb -notebooks/llm-chatbot/llm-chatbot-generate-api.ipynb -notebooks/llm-chatbot/llm-chatbot.ipynb -notebooks/llm-question-answering/llm-question-answering.ipynb -notebooks/llm-rag-langchain/llm-rag-langchain-genai.ipynb -notebooks/llm-rag-langchain/llm-rag-langchain.ipynb -notebooks/llm-rag-llamaindex/llm-rag-llamaindex.ipynb -notebooks/localai/localai.ipynb -notebooks/ltx-video/ltx-video.ipynb -notebooks/magika-content-type-recognition/magika-content-type-recognition.ipynb -notebooks/meter-reader/meter-reader.ipynb -notebooks/minicpm-v-multimodal-chatbot/minicpm-v-multimodal-chatbot.ipynb -notebooks/mllama-3.2/mllama-3.2.ipynb -notebooks/mms-massively-multilingual-speech/mms-massively-multilingual-speech.ipynb -notebooks/mobileclip-video-search/mobileclip-video-search.ipynb -notebooks/modelscope-to-openvino/modelscope-to-openvino.ipynb -notebooks/model-server/model-server.ipynb -notebooks/multilora-image-generation/multilora-image-generation.ipynb -notebooks/multimodal-rag/multimodal-rag-llamaindex.ipynb -notebooks/music-generation/music-generation.ipynb -notebooks/named-entity-recognition/named-entity-recognition.ipynb -notebooks/nano-llava-multimodal-chatbot/nano-llava-multimodal-chatbot.ipynb -notebooks/nuextract-structure-extraction/nuextract-structure-extraction.ipynb -notebooks/object-detection-webcam/object-detection.ipynb -notebooks/omnigen/omnigen.ipynb -notebooks/omniparser/omniparser.ipynb -notebooks/oneformer-segmentation/oneformer-segmentation.ipynb -notebooks/openvino-api/openvino-api.ipynb -notebooks/openvino-tokenizers/openvino-tokenizers.ipynb -notebooks/openvoice/openvoice.ipynb -notebooks/optical-character-recognition/optical-character-recognition.ipynb -notebooks/optimize-preprocessing/optimize-preprocessing.ipynb -notebooks/outetts-text-to-speech/outetts-text-to-speech.ipynb -notebooks/paddle-ocr-webcam/paddle-ocr-webcam.ipynb -notebooks/paddle-to-openvino/paddle-to-openvino-classification.ipynb -notebooks/parler-tts-text-to-speech/parler-tts-text-to-speech.ipynb 
-notebooks/person-counting-webcam/person-counting.ipynb -notebooks/person-tracking-webcam/person-tracking.ipynb -notebooks/phi-3-vision/phi-3-vision.ipynb -notebooks/photo-maker/photo-maker.ipynb -notebooks/pix2struct-docvqa/pix2struct-docvqa.ipynb -notebooks/pixart/pixart.ipynb -notebooks/pixtral/pixtral.ipynb -notebooks/pose-estimation-webcam/pose-estimation.ipynb -notebooks/pytorch-post-training-quantization-nncf/pytorch-post-training-quantization-nncf.ipynb -notebooks/pytorch-quantization-aware-training/pytorch-quantization-aware-training.ipynb -notebooks/pytorch-quantization-sparsity-aware-training/pytorch-quantization-sparsity-aware-training.ipynb -notebooks/pytorch-to-openvino/pytorch-onnx-to-openvino.ipynb -notebooks/pytorch-to-openvino/pytorch-to-openvino.ipynb -notebooks/qrcode-monster/qrcode-monster.ipynb -notebooks/quantizing-model-with-accuracy-control/speech-recognition-quantization-wav2vec2.ipynb -notebooks/quantizing-model-with-accuracy-control/yolov11-quantization-with-accuracy-control.ipynb -notebooks/qwen2-audio/qwen2-audio.ipynb -notebooks/qwen2-vl/qwen2-vl.ipynb -notebooks/riffusion-text-to-music/riffusion-text-to-music.ipynb -notebooks/rmbg-background-removal/rmbg-background-removal.ipynb -notebooks/s3d-mil-nce-text-to-video-retrieval/s3d-mil-nce-text-to-video-retrieval.ipynb -notebooks/sam2-image-segmentation/segment-anything-2-image.ipynb -notebooks/sam2-video-segmentation/segment-anything-2-video.ipynb -notebooks/sana-image-generation/sana-image-generation.ipynb -notebooks/sdxl-turbo/sdxl-turbo.ipynb -notebooks/segment-anything/segment-anything.ipynb -notebooks/siglip-zero-shot-image-classification/siglip-zero-shot-image-classification.ipynb -notebooks/sketch-to-image-pix2pix-turbo/sketch-to-image-pix2pix-turbo.ipynb -notebooks/softvc-voice-conversion/softvc-voice-conversion.ipynb -notebooks/sound-generation-audioldm2/sound-generation-audioldm2.ipynb -notebooks/sparsity-optimization/sparsity-optimization.ipynb -notebooks/speculative-sampling/speculative-sampling.ipynb -notebooks/speechbrain-emotion-recognition/speechbrain-emotion-recognition.ipynb -notebooks/speech-recognition-quantization/speech-recognition-quantization-wav2vec2.ipynb -notebooks/stable-audio/stable-audio.ipynb -notebooks/stable-cascade-image-generation/stable-cascade-image-generation.ipynb -notebooks/stable-diffusion-ip-adapter/stable-diffusion-ip-adapter.ipynb -notebooks/stable-diffusion-keras-cv/stable-diffusion-keras-cv.ipynb -notebooks/stable-diffusion-text-to-image/stable-diffusion-text-to-image.ipynb -notebooks/stable-diffusion-torchdynamo-backend/stable-diffusion-torchdynamo-backend.ipynb -notebooks/stable-diffusion-v2/stable-diffusion-v2-infinite-zoom.ipynb -notebooks/stable-diffusion-v2/stable-diffusion-v2-optimum-demo.ipynb -notebooks/stable-diffusion-v2/stable-diffusion-v2-text-to-image-demo.ipynb -notebooks/stable-diffusion-v2/stable-diffusion-v2-text-to-image.ipynb -notebooks/stable-diffusion-v3/stable-diffusion-v3.ipynb -notebooks/stable-diffusion-v3/stable-diffusion-v3-torch-fx.ipynb -notebooks/stable-diffusion-xl/stable-diffusion-xl.ipynb -notebooks/stable-fast-3d/stable-fast-3d.ipynb -notebooks/stable-video-diffusion/stable-video-diffusion.ipynb -notebooks/style-transfer-webcam/style-transfer.ipynb -notebooks/surya-line-level-text-detection/surya-line-level-text-detection.ipynb -notebooks/table-question-answering/table-question-answering.ipynb -notebooks/tensorflow-classification-to-openvino/tensorflow-classification-to-openvino.ipynb -notebooks/tensorflow-hub/tensorflow-hub.ipynb 
-notebooks/tensorflow-object-detection-to-openvino/tensorflow-instance-segmentation-to-openvino.ipynb -notebooks/tensorflow-object-detection-to-openvino/tensorflow-object-detection-to-openvino.ipynb -notebooks/tensorflow-quantization-aware-training/tensorflow-quantization-aware-training.ipynb -notebooks/text-to-image-genai/text-to-image-genai.ipynb -notebooks/tflite-selfie-segmentation/tflite-selfie-segmentation.ipynb -notebooks/tflite-to-openvino/tflite-to-openvino.ipynb -notebooks/tiny-sd-image-generation/tiny-sd-image-generation.ipynb -notebooks/torchvision-zoo-to-openvino/convnext-classification.ipynb -notebooks/vehicle-detection-and-recognition/vehicle-detection-and-recognition.ipynb -notebooks/vision-background-removal/vision-background-removal.ipynb -notebooks/vision-monodepth/vision-monodepth.ipynb -notebooks/wav2lip/wav2lip.ipynb -notebooks/whisper-asr-genai/whisper-asr-genai.ipynb -notebooks/whisper-subtitles-generation/whisper-subtitles-generation.ipynb -notebooks/yolov10-optimization/yolov10-optimization.ipynb -notebooks/yolov11-optimization/yolov11-instance-segmentation.ipynb -notebooks/yolov11-optimization/yolov11-keypoint-detection.ipynb -notebooks/yolov11-optimization/yolov11-object-detection.ipynb -notebooks/yolov8-optimization/yolov8-instance-segmentation.ipynb -notebooks/yolov8-optimization/yolov8-keypoint-detection.ipynb -notebooks/yolov8-optimization/yolov8-obb.ipynb -notebooks/yolov8-optimization/yolov8-object-detection.ipynb -notebooks/yolov9-optimization/yolov9-optimization.ipynb -notebooks/zeroscope-text2video/zeroscope-text2video.ipynb diff --git a/docs/notebooks/animate-anyone-with-output.rst b/docs/notebooks/animate-anyone-with-output.rst deleted file mode 100644 index fe176ddb066623..00000000000000 --- a/docs/notebooks/animate-anyone-with-output.rst +++ /dev/null @@ -1,1266 +0,0 @@ -Image-to-Video synthesis with AnimateAnyone and OpenVINO -======================================================== - -|image0| - -`AnimateAnyone `__ tackles the -task of generating animation sequences from a single character image. It -builds upon diffusion models pre-trained on vast character image -datasets. - -The core of AnimateAnyone is a diffusion model pre-trained on a massive -dataset of character images. This model learns the underlying character -representation and distribution, allowing for realistic and diverse -character animation. To capture the specific details and characteristics -of the input character image, AnimateAnyone incorporates a ReferenceNet -module. This module acts like an attention mechanism, focusing on the -input image and guiding the animation process to stay consistent with -the original character’s appearance. AnimateAnyone enables control over -the character’s pose during animation. This might involve using -techniques like parametric pose embedding or direct pose vector input, -allowing for the creation of various character actions and movements. To -ensure smooth transitions and temporal coherence throughout the -animation sequence, AnimateAnyone incorporates temporal modeling -techniques. This may involve recurrent architectures like LSTMs or -transformers that capture the temporal dependencies between video -frames. - -Overall, AnimateAnyone combines a powerful pre-trained diffusion model -with a character-specific attention mechanism (ReferenceNet), pose -guidance, and temporal modeling to achieve controllable, high-fidelity -character animation from a single image. - -Learn more in `GitHub -repo `__ and -`paper `__. - -.. 
container:: alert alert-warning - - :: - -

! WARNING !

-

- This tutorial requires at least 96 GB of RAM for model conversion and 40 GB for inference. Changing the values of the HEIGHT, WIDTH, and VIDEO_LENGTH variables will change the memory consumption but will also affect accuracy. -

- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Prepare base model <#prepare-base-model>`__ -- `Prepare image encoder <#prepare-image-encoder>`__ -- `Download weights <#download-weights>`__ -- `Initialize models <#initialize-models>`__ -- `Load pretrained weights <#load-pretrained-weights>`__ -- `Convert model to OpenVINO IR <#convert-model-to-openvino-ir>`__ - - - `VAE <#vae>`__ - - `Reference UNet <#reference-unet>`__ - - `Denoising UNet <#denoising-unet>`__ - - `Pose Guider <#pose-guider>`__ - - `Image Encoder <#image-encoder>`__ - -- `Inference <#inference>`__ -- `Video post-processing <#video-post-processing>`__ -- `Interactive inference <#interactive-inference>`__ - - - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. |image0| image:: https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/animate-anyone/animate-anyone.gif - - -Prerequisites -------------- - - - -.. code:: ipython3 - - from pathlib import Path - import requests - - - %pip install -q "torch>=2.1" torchvision einops omegaconf "diffusers<=0.24" "huggingface-hub<0.26.0" transformers av accelerate "gradio>=4.19" --extra-index-url "https://download.pytorch.org/whl/cpu" - %pip install -q "openvino>=2024.0" "nncf>=2.9.0" - - helpers = ["skip_kernel_extension.py", "notebook_utils.py", "cmd_helper.py"] - - for file_name in helpers: - if not Path(file_name).exists(): - r = requests.get( - url=f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/{file_name}", - ) - open(file_name, "w").write(r.text) - - - from cmd_helper import clone_repo - - clone_repo("https://github.com/itrushkin/Moore-AnimateAnyone.git") - - %load_ext skip_kernel_extension - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("animate-anyone.ipynb") - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - -Note that we clone a fork of original repo with tweaked forward methods. - -.. code:: ipython3 - - MODEL_DIR = Path("models") - VAE_ENCODER_PATH = MODEL_DIR / "vae_encoder.xml" - VAE_DECODER_PATH = MODEL_DIR / "vae_decoder.xml" - REFERENCE_UNET_PATH = MODEL_DIR / "reference_unet.xml" - DENOISING_UNET_PATH = MODEL_DIR / "denoising_unet.xml" - POSE_GUIDER_PATH = MODEL_DIR / "pose_guider.xml" - IMAGE_ENCODER_PATH = MODEL_DIR / "image_encoder.xml" - - WIDTH = 448 - HEIGHT = 512 - VIDEO_LENGTH = 24 - - SHOULD_CONVERT = not all( - p.exists() - for p in [ - VAE_ENCODER_PATH, - VAE_DECODER_PATH, - REFERENCE_UNET_PATH, - DENOISING_UNET_PATH, - POSE_GUIDER_PATH, - IMAGE_ENCODER_PATH, - ] - ) - -.. 
code:: ipython3 - - from datetime import datetime - from typing import Optional, Union, List, Callable - import math - - from PIL import Image - import openvino as ov - from torchvision import transforms - from einops import repeat - from tqdm.auto import tqdm - from einops import rearrange - from omegaconf import OmegaConf - from diffusers import DDIMScheduler - from diffusers.image_processor import VaeImageProcessor - from transformers import CLIPImageProcessor - import torch - - from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline - from src.utils.util import get_fps, read_frames - from src.utils.util import save_videos_grid - from src.pipelines.context import get_context_scheduler - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/utils/outputs.py:63: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. - torch.utils._pytree._register_pytree_node( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/utils/outputs.py:63: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. - torch.utils._pytree._register_pytree_node( - - -.. code:: ipython3 - - %%skip not $SHOULD_CONVERT - from pathlib import PurePosixPath - import gc - import warnings - - from typing import Dict, Any - from diffusers import AutoencoderKL - from huggingface_hub import hf_hub_download, snapshot_download - from transformers import CLIPVisionModelWithProjection - import nncf - - from src.models.unet_2d_condition import UNet2DConditionModel - from src.models.unet_3d import UNet3DConditionModel - from src.models.pose_guider import PoseGuider - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, openvino - - -Prepare base model ------------------- - - - -.. code:: ipython3 - - %%skip not $SHOULD_CONVERT - local_dir = Path("./pretrained_weights/stable-diffusion-v1-5") - local_dir.mkdir(parents=True, exist_ok=True) - for hub_file in ["unet/config.json", "unet/diffusion_pytorch_model.bin"]: - saved_path = local_dir / hub_file - if saved_path.exists(): - continue - hf_hub_download( - repo_id="botp/stable-diffusion-v1-5", - subfolder=PurePosixPath(saved_path.parent.name), - filename=PurePosixPath(saved_path.name), - local_dir=local_dir, - ) - - - -.. parsed-literal:: - - diffusion_pytorch_model.bin: 0%| | 0.00/3.44G [00:00:2: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature. - :6: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. - :9: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. - - -Convert model to OpenVINO IR ----------------------------- - -The pose sequence is initially -encoded using Pose Guider and fused with multi-frame noise, followed by -the Denoising UNet conducting the denoising process for video -generation. The computational block of the Denoising UNet consists of -Spatial-Attention, Cross-Attention, and Temporal-Attention, as -illustrated in the dashed box on the right. The integration of reference -image involves two aspects. Firstly, detailed features are extracted -through ReferenceNet and utilized for Spatial-Attention. Secondly, -semantic features are extracted through the CLIP image encoder for -Cross-Attention. Temporal-Attention operates in the temporal dimension. -Finally, the VAE decoder decodes the result into a video clip. - -|image01| - -The pipeline contains 6 PyTorch modules: - -- VAE encoder -- VAE decoder -- Image encoder -- Reference UNet -- Denoising UNet -- Pose Guider - -For reducing memory consumption, weights compression optimization can be -applied using `NNCF `__. Weight -compression aims to reduce the memory footprint of a model. models, -which require extensive memory to store the weights during inference, -can benefit from weight compression in the following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method. 
The main -difference between weights compression and full model quantization -(post-training quantization) is that activations remain floating-point -in the case of weights compression which leads to a better accuracy. In -addition, weight compression is data-free and does not require a -calibration dataset, making it easy to use. - -``nncf.compress_weights`` function can be used for performing weights -compression. The function accepts an OpenVINO model and other -compression parameters. - -More details about weights compression can be found in `OpenVINO -documentation `__. - -.. |image01| image:: https://humanaigc.github.io/animate-anyone/static/images/f2_img.png - -.. code:: ipython3 - - %%skip not $SHOULD_CONVERT - def cleanup_torchscript_cache(): - """ - Helper for removing cached model representation - """ - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - -.. code:: ipython3 - - %%skip not $SHOULD_CONVERT - warnings.simplefilter("ignore", torch.jit.TracerWarning) - -VAE -~~~ - - - -The VAE model has two parts, an encoder and a decoder. The encoder is -used to convert the image into a low dimensional latent representation, -which will serve as the input to the U-Net model. The decoder, -conversely, transforms the latent representation back into an image. - -During latent diffusion training, the encoder is used to get the latent -representations (latents) of the images for the forward diffusion -process, which applies more and more noise at each step. During -inference, the denoised latents generated by the reverse diffusion -process are converted back into images using the VAE decoder. - -As the encoder and the decoder are used independently in different parts -of the pipeline, it will be better to convert them to separate models. - -.. code:: ipython3 - - %%skip not $SHOULD_CONVERT - if not VAE_ENCODER_PATH.exists(): - class VaeEncoder(torch.nn.Module): - def __init__(self, vae): - super().__init__() - self.vae = vae - - def forward(self, x): - return self.vae.encode(x).latent_dist.mean - vae.eval() - with torch.no_grad(): - vae_encoder = ov.convert_model(VaeEncoder(vae), example_input=torch.zeros(1,3,512,448)) - vae_encoder = nncf.compress_weights(vae_encoder) - ov.save_model(vae_encoder, VAE_ENCODER_PATH) - del vae_encoder - cleanup_torchscript_cache() - - -.. parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (32 / 32) │ 100% (32 / 32) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - - - -.. parsed-literal:: - - Output() - - - - - - - - - -.. code:: ipython3 - - %%skip not $SHOULD_CONVERT - if not VAE_DECODER_PATH.exists(): - class VaeDecoder(torch.nn.Module): - def __init__(self, vae): - super().__init__() - self.vae = vae - - def forward(self, z): - return self.vae.decode(z).sample - vae.eval() - with torch.no_grad(): - vae_decoder = ov.convert_model(VaeDecoder(vae), example_input=torch.zeros(1,4,HEIGHT//8,WIDTH//8)) - vae_decoder = nncf.compress_weights(vae_decoder) - ov.save_model(vae_decoder, VAE_DECODER_PATH) - del vae_decoder - cleanup_torchscript_cache() - del vae - gc.collect() - - -.. 
parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (40 / 40) │ 100% (40 / 40) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - - - -.. parsed-literal:: - - Output() - - - - - - - - - -Reference UNet -~~~~~~~~~~~~~~ - - - -Pipeline extracts reference attention features from all transformer -blocks inside Reference UNet model. We call the original forward pass to -obtain shapes of the outputs as they will be used in the next pipeline -step. - -.. code:: ipython3 - - %%skip not $SHOULD_CONVERT - if not REFERENCE_UNET_PATH.exists(): - class ReferenceUNetWrapper(torch.nn.Module): - def __init__(self, reference_unet): - super().__init__() - self.reference_unet = reference_unet - - def forward(self, sample, timestep, encoder_hidden_states): - return self.reference_unet(sample, timestep, encoder_hidden_states, return_dict=False)[1] - - sample = torch.zeros(2, 4, HEIGHT // 8, WIDTH // 8) - timestep = torch.tensor(0) - encoder_hidden_states = torch.zeros(2, 1, 768) - reference_unet.eval() - with torch.no_grad(): - wrapper = ReferenceUNetWrapper(reference_unet) - example_input = (sample, timestep, encoder_hidden_states) - ref_features_shapes = {k: v.shape for k, v in wrapper(*example_input).items()} - ov_reference_unet = ov.convert_model( - wrapper, - example_input=example_input, - ) - ov_reference_unet = nncf.compress_weights(ov_reference_unet) - ov.save_model(ov_reference_unet, REFERENCE_UNET_PATH) - del ov_reference_unet - del wrapper - cleanup_torchscript_cache() - del reference_unet - gc.collect() - - -.. parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (270 / 270) │ 100% (270 / 270) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - - - -.. parsed-literal:: - - Output() - - - - - - - - - -Denoising UNet -~~~~~~~~~~~~~~ - - - -Denoising UNet is the main part of all diffusion pipelines. This model -consumes the majority of memory, so we need to reduce its size as much -as possible. - -Here we make all shapes static meaning that the size of the video will -be constant. - -Also, we use the ``ref_features`` input with the same tensor shapes as -output of `Reference UNet <#reference-unet>`__ model on the previous -step. - -.. 
code:: ipython3 - - %%skip not $SHOULD_CONVERT - if not DENOISING_UNET_PATH.exists(): - class DenoisingUNetWrapper(torch.nn.Module): - def __init__(self, denoising_unet): - super().__init__() - self.denoising_unet = denoising_unet - - def forward( - self, - sample, - timestep, - encoder_hidden_states, - pose_cond_fea, - ref_features - ): - return self.denoising_unet( - sample, - timestep, - encoder_hidden_states, - ref_features, - pose_cond_fea=pose_cond_fea, - return_dict=False) - - example_input = { - "sample": torch.zeros(2, 4, VIDEO_LENGTH, HEIGHT // 8, WIDTH // 8), - "timestep": torch.tensor(999), - "encoder_hidden_states": torch.zeros(2,1,768), - "pose_cond_fea": torch.zeros(2, 320, VIDEO_LENGTH, HEIGHT // 8, WIDTH // 8), - "ref_features": {k: torch.zeros(shape) for k, shape in ref_features_shapes.items()} - } - - denoising_unet.eval() - with torch.no_grad(): - ov_denoising_unet = ov.convert_model( - DenoisingUNetWrapper(denoising_unet), - example_input=tuple(example_input.values()) - ) - ov_denoising_unet.inputs[0].get_node().set_partial_shape(ov.PartialShape((2, 4, VIDEO_LENGTH, HEIGHT // 8, WIDTH // 8))) - ov_denoising_unet.inputs[2].get_node().set_partial_shape(ov.PartialShape((2, 1, 768))) - ov_denoising_unet.inputs[3].get_node().set_partial_shape(ov.PartialShape((2, 320, VIDEO_LENGTH, HEIGHT // 8, WIDTH // 8))) - for ov_input, shape in zip(ov_denoising_unet.inputs[4:], ref_features_shapes.values()): - ov_input.get_node().set_partial_shape(ov.PartialShape(shape)) - ov_input.get_node().set_element_type(ov.Type.f32) - ov_denoising_unet.validate_nodes_and_infer_types() - ov_denoising_unet = nncf.compress_weights(ov_denoising_unet) - ov.save_model(ov_denoising_unet, DENOISING_UNET_PATH) - del ov_denoising_unet - cleanup_torchscript_cache() - del denoising_unet - gc.collect() - - -.. parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (534 / 534) │ 100% (534 / 534) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - - - -.. parsed-literal:: - - Output() - - - - - - - - - -Pose Guider -~~~~~~~~~~~ - - - -To ensure pose controllability, a lightweight pose guider is devised to -efficiently integrate pose control signals into the denoising process. - -.. code:: ipython3 - - %%skip not $SHOULD_CONVERT - if not POSE_GUIDER_PATH.exists(): - pose_guider.eval() - with torch.no_grad(): - ov_pose_guider = ov.convert_model(pose_guider, example_input=torch.zeros(1, 3, VIDEO_LENGTH, HEIGHT, WIDTH)) - ov_pose_guider = nncf.compress_weights(ov_pose_guider) - ov.save_model(ov_pose_guider, POSE_GUIDER_PATH) - del ov_pose_guider - cleanup_torchscript_cache() - del pose_guider - gc.collect() - - -.. parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (8 / 8) │ 100% (8 / 8) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - - - -.. 
parsed-literal:: - - Output() - - - - - - - - - -Image Encoder -~~~~~~~~~~~~~ - - - -Pipeline uses CLIP image encoder to generate encoder hidden states -required for both reference and denoising UNets. - -.. code:: ipython3 - - %%skip not $SHOULD_CONVERT - if not IMAGE_ENCODER_PATH.exists(): - image_enc.eval() - with torch.no_grad(): - ov_image_encoder = ov.convert_model(image_enc, example_input=torch.zeros(1, 3, 224, 224), input=(1, 3, 224, 224)) - ov_image_encoder = nncf.compress_weights(ov_image_encoder) - ov.save_model(ov_image_encoder, IMAGE_ENCODER_PATH) - del ov_image_encoder - cleanup_torchscript_cache() - del image_enc - gc.collect() - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead - warnings.warn( - `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. - - -.. parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (146 / 146) │ 100% (146 / 146) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - - - -.. parsed-literal:: - - Output() - - - - - - - - - -Inference ---------- - - - -We inherit from the original pipeline modifying the calls to our models -to match OpenVINO format. - -.. code:: ipython3 - - core = ov.Core() - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -For starting work, please select inference device from dropdown list. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - -.. 
code:: ipython3 - - class OVPose2VideoPipeline(Pose2VideoPipeline): - def __init__( - self, - vae_encoder_path=VAE_ENCODER_PATH, - vae_decoder_path=VAE_DECODER_PATH, - image_encoder_path=IMAGE_ENCODER_PATH, - reference_unet_path=REFERENCE_UNET_PATH, - denoising_unet_path=DENOISING_UNET_PATH, - pose_guider_path=POSE_GUIDER_PATH, - device=device.value, - ): - self.vae_encoder = core.compile_model(vae_encoder_path, device) - self.vae_decoder = core.compile_model(vae_decoder_path, device) - self.image_encoder = core.compile_model(image_encoder_path, device) - self.reference_unet = core.compile_model(reference_unet_path, device) - self.denoising_unet = core.compile_model(denoising_unet_path, device) - self.pose_guider = core.compile_model(pose_guider_path, device) - self.scheduler = DDIMScheduler(**OmegaConf.to_container(infer_config.noise_scheduler_kwargs)) - - self.vae_scale_factor = 8 - self.clip_image_processor = CLIPImageProcessor() - self.ref_image_processor = VaeImageProcessor(do_convert_rgb=True) - self.cond_image_processor = VaeImageProcessor(do_convert_rgb=True, do_normalize=False) - - def decode_latents(self, latents): - video_length = latents.shape[2] - latents = 1 / 0.18215 * latents - latents = rearrange(latents, "b c f h w -> (b f) c h w") - # video = self.vae.decode(latents).sample - video = [] - for frame_idx in tqdm(range(latents.shape[0])): - video.append(torch.from_numpy(self.vae_decoder(latents[frame_idx : frame_idx + 1])[0])) - video = torch.cat(video) - video = rearrange(video, "(b f) c h w -> b c f h w", f=video_length) - video = (video / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 - video = video.cpu().float().numpy() - return video - - def __call__( - self, - ref_image, - pose_images, - width, - height, - video_length, - num_inference_steps=30, - guidance_scale=3.5, - num_images_per_prompt=1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - output_type: Optional[str] = "tensor", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: Optional[int] = 1, - context_schedule="uniform", - context_frames=24, - context_stride=1, - context_overlap=4, - context_batch_size=1, - interpolation_factor=1, - **kwargs, - ): - do_classifier_free_guidance = guidance_scale > 1.0 - - # Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - batch_size = 1 - - # Prepare clip image embeds - clip_image = self.clip_image_processor.preprocess(ref_image.resize((224, 224)), return_tensors="pt").pixel_values - clip_image_embeds = self.image_encoder(clip_image)["image_embeds"] - clip_image_embeds = torch.from_numpy(clip_image_embeds) - encoder_hidden_states = clip_image_embeds.unsqueeze(1) - uncond_encoder_hidden_states = torch.zeros_like(encoder_hidden_states) - - if do_classifier_free_guidance: - encoder_hidden_states = torch.cat([uncond_encoder_hidden_states, encoder_hidden_states], dim=0) - - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - 4, - width, - height, - video_length, - clip_image_embeds.dtype, - torch.device("cpu"), - generator, - ) - - # Prepare extra step kwargs. 
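        # `prepare_extra_step_kwargs` is inherited from the parent Pose2VideoPipeline; in the
        # usual diffusers-style implementation it forwards `eta` and `generator` to the scheduler
        # only if `scheduler.step()` accepts them (`eta` is meaningful for DDIM-like schedulers).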
- extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # Prepare ref image latents - ref_image_tensor = self.ref_image_processor.preprocess(ref_image, height=height, width=width) # (bs, c, width, height) - ref_image_latents = self.vae_encoder(ref_image_tensor)[0] - ref_image_latents = ref_image_latents * 0.18215 # (b, 4, h, w) - ref_image_latents = torch.from_numpy(ref_image_latents) - - # Prepare a list of pose condition images - pose_cond_tensor_list = [] - for pose_image in pose_images: - pose_cond_tensor = self.cond_image_processor.preprocess(pose_image, height=height, width=width) - pose_cond_tensor = pose_cond_tensor.unsqueeze(2) # (bs, c, 1, h, w) - pose_cond_tensor_list.append(pose_cond_tensor) - pose_cond_tensor = torch.cat(pose_cond_tensor_list, dim=2) # (bs, c, t, h, w) - pose_fea = self.pose_guider(pose_cond_tensor)[0] - pose_fea = torch.from_numpy(pose_fea) - - context_scheduler = get_context_scheduler(context_schedule) - - # denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - noise_pred = torch.zeros( - ( - latents.shape[0] * (2 if do_classifier_free_guidance else 1), - *latents.shape[1:], - ), - device=latents.device, - dtype=latents.dtype, - ) - counter = torch.zeros( - (1, 1, latents.shape[2], 1, 1), - device=latents.device, - dtype=latents.dtype, - ) - - # 1. Forward reference image - if i == 0: - ref_features = self.reference_unet( - ( - ref_image_latents.repeat((2 if do_classifier_free_guidance else 1), 1, 1, 1), - torch.zeros_like(t), - # t, - encoder_hidden_states, - ) - ).values() - - context_queue = list( - context_scheduler( - 0, - num_inference_steps, - latents.shape[2], - context_frames, - context_stride, - 0, - ) - ) - num_context_batches = math.ceil(len(context_queue) / context_batch_size) - - context_queue = list( - context_scheduler( - 0, - num_inference_steps, - latents.shape[2], - context_frames, - context_stride, - context_overlap, - ) - ) - - num_context_batches = math.ceil(len(context_queue) / context_batch_size) - global_context = [] - for i in range(num_context_batches): - global_context.append(context_queue[i * context_batch_size : (i + 1) * context_batch_size]) - - for context in global_context: - # 3.1 expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents[:, :, c] for c in context]).repeat(2 if do_classifier_free_guidance else 1, 1, 1, 1, 1) - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - b, c, f, h, w = latent_model_input.shape - latent_pose_input = torch.cat([pose_fea[:, :, c] for c in context]).repeat(2 if do_classifier_free_guidance else 1, 1, 1, 1, 1) - - pred = self.denoising_unet( - ( - latent_model_input, - t, - encoder_hidden_states[:b], - latent_pose_input, - *ref_features, - ) - )[0] - - for j, c in enumerate(context): - noise_pred[:, :, c] = noise_pred[:, :, c] + pred - counter[:, :, c] = counter[:, :, c] + 1 - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = (noise_pred / counter).chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: 
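                        # Following the diffusers convention, the callback receives (step_idx, timestep, latents);
                        # dividing the loop index by the scheduler order accounts for higher-order schedulers
                        # that take several model evaluations per denoising step.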
- step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if interpolation_factor > 0: - latents = self.interpolate_latents(latents, interpolation_factor, latents.device) - # Post-processing - images = self.decode_latents(latents) # (b, c, f, h, w) - - # Convert to tensor - if output_type == "tensor": - images = torch.from_numpy(images) - - return images - -.. code:: ipython3 - - pipe = OVPose2VideoPipeline() - -.. code:: ipython3 - - pose_images = read_frames("Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-2_kps.mp4") - src_fps = get_fps("Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-2_kps.mp4") - ref_image = Image.open("Moore-AnimateAnyone/configs/inference/ref_images/anyone-5.png").convert("RGB") - pose_list = [] - for pose_image_pil in pose_images[:VIDEO_LENGTH]: - pose_list.append(pose_image_pil) - -.. code:: ipython3 - - video = pipe( - ref_image, - pose_list, - width=WIDTH, - height=HEIGHT, - video_length=VIDEO_LENGTH, - ) - - - -.. parsed-literal:: - - 0%| | 0/30 [00:00 b c (repeat f) h w", repeat=VIDEO_LENGTH) - pose_tensor = torch.stack(pose_tensor_list, dim=0) # (f, c, h, w) - pose_tensor = pose_tensor.transpose(0, 1) - pose_tensor = pose_tensor.unsqueeze(0) - video = torch.cat([ref_image_tensor, pose_tensor, video], dim=0) - - save_dir = Path("./output") - save_dir.mkdir(parents=True, exist_ok=True) - date_str = datetime.now().strftime("%Y%m%d") - time_str = datetime.now().strftime("%H%M") - out_path = save_dir / f"{date_str}T{time_str}.mp4" - save_videos_grid( - video, - str(out_path), - n_rows=3, - fps=src_fps, - ) - -.. code:: ipython3 - - from IPython.display import Video - - Video(out_path, embed=True) - - - - -.. raw:: html - - - - - -Interactive inference ---------------------- - - - -.. code:: ipython3 - - import gradio as gr - - - def generate( - img, - pose_vid, - seed, - guidance_scale, - num_inference_steps, - _=gr.Progress(track_tqdm=True), - ): - generator = torch.Generator().manual_seed(seed) - pose_list = read_frames(pose_vid)[:VIDEO_LENGTH] - video = pipe( - img, - pose_list, - width=WIDTH, - height=HEIGHT, - video_length=VIDEO_LENGTH, - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=num_inference_steps, - ) - new_h, new_w = video.shape[-2:] - pose_transform = transforms.Compose([transforms.Resize((new_h, new_w)), transforms.ToTensor()]) - pose_tensor_list = [] - for pose_image_pil in pose_list: - pose_tensor_list.append(pose_transform(pose_image_pil)) - - ref_image_tensor = pose_transform(img) # (c, h, w) - ref_image_tensor = ref_image_tensor.unsqueeze(1).unsqueeze(0) # (1, c, 1, h, w) - ref_image_tensor = repeat(ref_image_tensor, "b c f h w -> b c (repeat f) h w", repeat=VIDEO_LENGTH) - pose_tensor = torch.stack(pose_tensor_list, dim=0) # (f, c, h, w) - pose_tensor = pose_tensor.transpose(0, 1) - pose_tensor = pose_tensor.unsqueeze(0) - video = torch.cat([ref_image_tensor, pose_tensor, video], dim=0) - - save_dir = Path("./output/gradio") - save_dir.mkdir(parents=True, exist_ok=True) - date_str = datetime.now().strftime("%Y%m%d") - time_str = datetime.now().strftime("%H%M") - out_path = save_dir / f"{date_str}T{time_str}.mp4" - save_videos_grid( - video, - str(out_path), - n_rows=3, - fps=12, - ) - return out_path - -.. 
code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/animate-anyone/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=generate) - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(debug=False, share=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/" - - -.. parsed-literal:: - - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. - - - - - - - diff --git a/docs/notebooks/async-api-with-output.rst b/docs/notebooks/async-api-with-output.rst deleted file mode 100644 index 0ae18989db5d4f..00000000000000 --- a/docs/notebooks/async-api-with-output.rst +++ /dev/null @@ -1,659 +0,0 @@ -Asynchronous Inference with OpenVINO™ -===================================== - -This notebook demonstrates how to use the `Async -API `__ -for asynchronous execution with OpenVINO. - -OpenVINO Runtime supports inference in either synchronous or -asynchronous mode. The key advantage of the Async API is that when a -device is busy with inference, the application can perform other tasks -in parallel (for example, populating inputs or scheduling other -requests) rather than wait for the current inference to complete first. - - -**Table of contents:** - - -- `Imports <#imports>`__ -- `Prepare model and data - processing <#prepare-model-and-data-processing>`__ - - - `Download test model <#download-test-model>`__ - - `Load the model <#load-the-model>`__ - - `Create functions for data - processing <#create-functions-for-data-processing>`__ - - `Get the test video <#get-the-test-video>`__ - -- `How to improve the throughput of video - processing <#how-to-improve-the-throughput-of-video-processing>`__ - - - `Sync Mode (default) <#sync-mode-default>`__ - - `Test performance in Sync Mode <#test-performance-in-sync-mode>`__ - - `Async Mode <#async-mode>`__ - - `Test the performance in Async - Mode <#test-the-performance-in-async-mode>`__ - - `Compare the performance <#compare-the-performance>`__ - -- `AsyncInferQueue <#asyncinferqueue>`__ - - - `Setting Callback <#setting-callback>`__ - - `Test the performance with - AsyncInferQueue <#test-the-performance-with-asyncinferqueue>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Imports -------- - - - -.. code:: ipython3 - - %pip install -q "openvino>=2023.1.0" - %pip install -q opencv-python tqdm "matplotlib>=3.4" - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - -.. 
code:: ipython3 - - import cv2 - import time - import numpy as np - import openvino as ov - from IPython import display - import matplotlib.pyplot as plt - from pathlib import Path - - # Fetch the notebook utils script from the openvino_notebooks repo - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - import notebook_utils as utils - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("async-api.ipynb") - -Prepare model and data processing ---------------------------------- - - - -Download test model -~~~~~~~~~~~~~~~~~~~ - - - -We use a pre-trained model from OpenVINO’s `Open Model -Zoo `__ -to start the test. In this case, the model will be executed to detect -the person in each frame of the video. - -.. code:: ipython3 - - from pathlib import Path - - # directory where model will be downloaded - base_model_dir = "model" - - # model name as named in Open Model Zoo - model_name = "person-detection-0202" - precision = "FP16" - model_path = Path("model") / f"{model_name}.xml" - - base_model_url = "https://storage.openvinotoolkit.org/repositories/open_model_zoo/2023.0/models_bin/1" - - if not Path(model_path).exists(): - utils.download_file(f"{base_model_url}/{model_name}/{precision}/{model_name}.xml", filename=model_path.name, directory=model_path.parent) - utils.download_file( - f"{base_model_url}/{model_name}/{precision}/{model_name}.bin", filename=model_path.name.replace(".xml", ".bin"), directory=model_path.parent - ) - - - -.. parsed-literal:: - - person-detection-0202.xml: 0%| | 0.00/249k [00:00 0.5: - xmin = int(max((xmin * image.shape[1]), 10)) - ymin = int(max((ymin * image.shape[0]), 10)) - xmax = int(min((xmax * image.shape[1]), image.shape[1] - 10)) - ymax = int(min((ymax * image.shape[0]), image.shape[0] - 10)) - cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2) - cv2.putText( - image, - str(round(fps, 2)) + " fps", - (5, 20), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (0, 255, 0), - 3, - ) - return image - -Get the test video -~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - video_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/CEO%20Pat%20Gelsinger%20on%20Leading%20Intel.mp4" - video_path = Path("data/test_video.mp4") - if not video_path.exists(): - utils.download_file(video_url, video_path.name, video_path.parent) - - - -.. parsed-literal:: - - test_video.mp4: 0%| | 0.00/1.55M [00:00`__ -wrapper class. This class automatically spawns the pool of -``InferRequest`` objects (also called “jobs”) and provides -synchronization mechanisms to control the flow of the pipeline. It is a -simpler way to manage the infer request queue in Asynchronous mode. - -Setting Callback -~~~~~~~~~~~~~~~~ - - - -When ``callback`` is set, any job that ends inference calls upon the -Python function. The ``callback`` function must have two arguments: one -is the request that calls the ``callback``, which provides the -``InferRequest`` API; the other is called “user data”, which provides -the possibility of passing runtime values. - -.. 
code:: ipython3 - - def callback(infer_request, info) -> None: - """ - Define the callback function for postprocessing - - :param: infer_request: the infer_request object - info: a tuple includes original frame and starts time - :returns: - None - """ - global frame_number - global total_time - global inferqueue_fps - stop_time = time.time() - frame, start_time = info - total_time = stop_time - start_time - frame_number = frame_number + 1 - inferqueue_fps = frame_number / total_time - - res = infer_request.get_output_tensor(0).data[0] - frame = postprocess(res, frame, inferqueue_fps) - # Encode numpy array to jpg - _, encoded_img = cv2.imencode(".jpg", frame, params=[cv2.IMWRITE_JPEG_QUALITY, 90]) - # Create IPython image - i = display.Image(data=encoded_img) - # Display the image in this notebook - display.clear_output(wait=True) - display.display(i) - -.. code:: ipython3 - - def inferqueue(source, flip, fps, skip_first_frames) -> None: - """ - Define the main function for video processing with async infer queue - - :param: source: the video path or the ID of your webcam - :retuns: - None - """ - # Create infer requests queue - infer_queue = ov.AsyncInferQueue(compiled_model, 2) - infer_queue.set_callback(callback) - player = None - try: - # Create a video player - player = utils.VideoPlayer(source, flip=flip, fps=fps, skip_first_frames=skip_first_frames) - # Start capturing - start_time = time.time() - player.start() - while True: - # Capture frame - frame = player.next() - if frame is None: - print("Source ended") - break - resized_frame = preprocess(frame) - # Start the inference request with async infer queue - infer_queue.start_async({input_layer_ir.any_name: resized_frame}, (frame, start_time)) - except KeyboardInterrupt: - print("Interrupted") - # Any different error - except RuntimeError as e: - print(e) - finally: - infer_queue.wait_all() - player.stop() - -Test the performance with ``AsyncInferQueue`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - frame_number = 0 - total_time = 0 - inferqueue(source=video_path, flip=False, fps=30, skip_first_frames=800) - print(f"average throughput in async mode with async infer queue: {inferqueue_fps:.2f} fps") - - - -.. image:: async-api-with-output_files/async-api-with-output_29_0.png - - -.. 
parsed-literal:: - - average throughput in async mode with async infer queue: 146.24 fps - diff --git a/docs/notebooks/async-api-with-output_files/async-api-with-output_17_0.png b/docs/notebooks/async-api-with-output_files/async-api-with-output_17_0.png deleted file mode 100644 index 60045bb3758a43..00000000000000 --- a/docs/notebooks/async-api-with-output_files/async-api-with-output_17_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ad51650644959fe669787d637d49fe35cfc85c9c8f5637470403470f56f6abb -size 4307 diff --git a/docs/notebooks/async-api-with-output_files/async-api-with-output_21_0.png b/docs/notebooks/async-api-with-output_files/async-api-with-output_21_0.png deleted file mode 100644 index 60045bb3758a43..00000000000000 --- a/docs/notebooks/async-api-with-output_files/async-api-with-output_21_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ad51650644959fe669787d637d49fe35cfc85c9c8f5637470403470f56f6abb -size 4307 diff --git a/docs/notebooks/async-api-with-output_files/async-api-with-output_23_0.png b/docs/notebooks/async-api-with-output_files/async-api-with-output_23_0.png deleted file mode 100644 index 951a905665c74e..00000000000000 --- a/docs/notebooks/async-api-with-output_files/async-api-with-output_23_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0f02ccae03ea02880418076118715700e9b455e5a61e2f00bf47b9cbf4f0ed8b -size 29467 diff --git a/docs/notebooks/async-api-with-output_files/async-api-with-output_29_0.png b/docs/notebooks/async-api-with-output_files/async-api-with-output_29_0.png deleted file mode 100644 index 60045bb3758a43..00000000000000 --- a/docs/notebooks/async-api-with-output_files/async-api-with-output_29_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ad51650644959fe669787d637d49fe35cfc85c9c8f5637470403470f56f6abb -size 4307 diff --git a/docs/notebooks/auto-device-with-output.rst b/docs/notebooks/auto-device-with-output.rst deleted file mode 100644 index 05194a3905ffcb..00000000000000 --- a/docs/notebooks/auto-device-with-output.rst +++ /dev/null @@ -1,707 +0,0 @@ -Automatic Device Selection with OpenVINO™ -========================================= - -The `Auto -device `__ -(or AUTO in short) selects the most suitable device for inference by -considering the model precision, power efficiency and processing -capability of the available `compute -devices `__. -The model precision (such as ``FP32``, ``FP16``, ``INT8``, etc.) is the -first consideration to filter out the devices that cannot run the -network efficiently. - -Next, if dedicated accelerators are available, these devices are -preferred (for example, integrated and discrete -`GPU `__). -`CPU `__ -is used as the default “fallback device”. Keep in mind that AUTO makes -this selection only once, during the loading of a model. - -When using accelerator devices such as GPUs, loading models to these -devices may take a long time. To address this challenge for applications -that require fast first inference response, AUTO starts inference -immediately on the CPU and then transparently shifts inference to the -GPU, once it is ready. This dramatically reduces the time to execute -first inference. - -.. 
figure:: https://user-images.githubusercontent.com/15709723/161451847-759e2bdb-70bc-463d-9818-400c0ccf3c16.png - :alt: auto - - auto - - -**Table of contents:** - - -- `Import modules and create Core <#import-modules-and-create-core>`__ -- `Convert the model to OpenVINO IR - format <#convert-the-model-to-openvino-ir-format>`__ -- `(1) Simplify selection logic <#1-simplify-selection-logic>`__ - - - `Default behavior of Core::compile_model API without - device_name <#default-behavior-of-corecompile_model-api-without-device_name>`__ - - `Explicitly pass AUTO as device_name to Core::compile_model - API <#explicitly-pass-auto-as-device_name-to-corecompile_model-api>`__ - -- `(2) Improve the first inference - latency <#2-improve-the-first-inference-latency>`__ - - - `Load an Image <#load-an-image>`__ - - `Load the model to GPU device and perform - inference <#load-the-model-to-gpu-device-and-perform-inference>`__ - - `Load the model using AUTO device and do - inference <#load-the-model-using-auto-device-and-do-inference>`__ - -- `(3) Achieve different performance for different - targets <#3-achieve-different-performance-for-different-targets>`__ - - - `Class and callback definition <#class-and-callback-definition>`__ - - `Inference with THROUGHPUT - hint <#inference-with-throughput-hint>`__ - - `Inference with LATENCY hint <#inference-with-latency-hint>`__ - - `Difference in FPS and latency <#difference-in-fps-and-latency>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Import modules and create Core ------------------------------- - - - -.. code:: ipython3 - - import platform - - # Install required packages - %pip install -q "openvino>=2023.1.0" "matplotlib>=3.4" Pillow torch torchvision tqdm --extra-index-url https://download.pytorch.org/whl/cpu - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -.. code:: ipython3 - - import time - import sys - - import openvino as ov - - from IPython.display import Markdown, display - - core = ov.Core() - - if not any("GPU" in device for device in core.available_devices): - display( - Markdown( - '
 Warning: A GPU device is not available. This notebook requires a GPU device to have meaningful results.
' - ) - ) - - - -.. warning:: - - Warning: A GPU device is not available. This notebook requires GPU - device to have meaningful results. - - -Convert the model to OpenVINO IR format ---------------------------------------- - - - -This tutorial uses -`resnet50 `__ -model from -`torchvision `__ -library. ResNet 50 is image classification model pre-trained on ImageNet -dataset described in paper `“Deep Residual Learning for Image -Recognition” `__. From OpenVINO -2023.0, we can directly convert a model from the PyTorch format to the -OpenVINO IR format using model conversion API. To convert model, we -should provide model object instance into ``ov.convert_model`` function, -optionally, we can specify input shape for conversion (by default models -from PyTorch converted with dynamic input shapes). ``ov.convert_model`` -returns openvino.runtime.Model object ready to be loaded on a device -with ``ov.compile_model`` or serialized for next usage with -``ov.save_model``. - -For more information about model conversion API, see this -`page `__. - -.. code:: ipython3 - - import torchvision - from pathlib import Path - - base_model_dir = Path("./model") - base_model_dir.mkdir(exist_ok=True) - model_path = base_model_dir / "resnet50.xml" - - if not model_path.exists(): - pt_model = torchvision.models.resnet50(weights="DEFAULT") - ov_model = ov.convert_model(pt_model, input=[[1, 3, 224, 224]]) - ov.save_model(ov_model, str(model_path)) - print("IR model saved to {}".format(model_path)) - else: - print("Read IR model from {}".format(model_path)) - ov_model = core.read_model(model_path) - - -.. parsed-literal:: - - IR model saved to model/resnet50.xml - - -(1) Simplify selection logic ----------------------------- - - - -Default behavior of Core::compile_model API without device_name -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -By default, ``compile_model`` API will select **AUTO** as -``device_name`` if no device is specified. - -.. code:: ipython3 - - import openvino.properties.log as log - - - # Set LOG_LEVEL to LOG_INFO. - core.set_property("AUTO", {log.level(): log.Level.INFO}) - - # Load the model onto the target device. - compiled_model = core.compile_model(ov_model) - - if isinstance(compiled_model, ov.CompiledModel): - print("Successfully compiled model without a device_name.") - - -.. parsed-literal:: - - [23:05:31.1930]I[plugin.cpp:421][AUTO] device:CPU, config:LOG_LEVEL=LOG_INFO - [23:05:31.1930]I[plugin.cpp:421][AUTO] device:CPU, config:PERFORMANCE_HINT=LATENCY - [23:05:31.1931]I[plugin.cpp:421][AUTO] device:CPU, config:PERFORMANCE_HINT_NUM_REQUESTS=0 - [23:05:31.1931]I[plugin.cpp:421][AUTO] device:CPU, config:PERF_COUNT=NO - [23:05:31.1931]I[plugin.cpp:426][AUTO] device:CPU, priority:0 - [23:05:31.1931]I[schedule.cpp:17][AUTO] scheduler starting - [23:05:31.1931]I[auto_schedule.cpp:181][AUTO] select device:CPU - [23:05:31.3235]I[auto_schedule.cpp:346][AUTO] Device: [CPU]: Compile model took 130.407415 ms - [23:05:31.3237]I[auto_schedule.cpp:112][AUTO] device:CPU compiling model finished - [23:05:31.3238]I[plugin.cpp:454][AUTO] underlying hardware does not support hardware context - Successfully compiled model without a device_name. - - -.. code:: ipython3 - - # Deleted model will wait until compiling on the selected device is complete. - del compiled_model - print("Deleted compiled_model") - - -.. 
parsed-literal:: - - Deleted compiled_model - [23:05:31.3290]I[schedule.cpp:308][AUTO] scheduler ending - - -Explicitly pass AUTO as device_name to Core::compile_model API -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -It is optional, but passing AUTO explicitly as ``device_name`` may -improve readability of your code. - -.. code:: ipython3 - - # Set LOG_LEVEL to LOG_NONE. - core.set_property("AUTO", {log.level(): log.Level.NO}) - - compiled_model = core.compile_model(model=ov_model, device_name="AUTO") - - if isinstance(compiled_model, ov.CompiledModel): - print("Successfully compiled model using AUTO.") - - -.. parsed-literal:: - - Successfully compiled model using AUTO. - - -.. code:: ipython3 - - # Deleted model will wait until compiling on the selected device is complete. - del compiled_model - print("Deleted compiled_model") - - -.. parsed-literal:: - - Deleted compiled_model - - -(2) Improve the first inference latency ---------------------------------------- - - - -One of the benefits of using AUTO device selection is reducing FIL -(first inference latency). FIL is the model compilation time combined -with the first inference execution time. Using the CPU device explicitly -will produce the shortest first inference latency, as the OpenVINO graph -representation loads quickly on CPU, using just-in-time (JIT) -compilation. The challenge is with GPU devices since OpenCL graph -complication to GPU-optimized kernels takes a few seconds to complete. -This initialization time may be intolerable for some applications. To -avoid this delay, the AUTO uses CPU transparently as the first inference -device until GPU is ready. - -Load an Image -~~~~~~~~~~~~~ - - - -torchvision library provides model specific input transformation -function, we will reuse it for preparing input data. - -.. code:: ipython3 - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("auto-device.ipynb") - -.. code:: ipython3 - - from PIL import Image - - image_filename = Path("data/coco.jpg") - - if not image_filename.exists(): - # Download the image from the openvino_notebooks storage - image_filename = download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco.jpg", - directory="data", - ) - - image = Image.open(str(image_filename)) - input_transform = torchvision.models.ResNet50_Weights.DEFAULT.transforms() - - input_tensor = input_transform(image) - input_tensor = input_tensor.unsqueeze(0).numpy() - image - - - -.. parsed-literal:: - - coco.jpg: 0%| | 0.00/202k [00:00`__ -section of `Automatic Device -Selection `__ -article. - -Class and callback definition -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - class PerformanceMetrics: - """ - Record the latest performance metrics (fps and latency), update the metrics in each @interval seconds - :member: fps: Frames per second, indicates the average number of inferences executed each second during the last @interval seconds. - :member: latency: Average latency of inferences executed in the last @interval seconds. 
- :member: start_time: Record the start timestamp of onging @interval seconds duration. - :member: latency_list: Record the latency of each inference execution over @interval seconds duration. - :member: interval: The metrics will be updated every @interval seconds - """ - - def __init__(self, interval): - """ - Create and initilize one instance of class PerformanceMetrics. - :param: interval: The metrics will be updated every @interval seconds - :returns: - Instance of PerformanceMetrics - """ - self.fps = 0 - self.latency = 0 - - self.start_time = time.perf_counter() - self.latency_list = [] - self.interval = interval - - def update(self, infer_request: ov.InferRequest) -> bool: - """ - Update the metrics if current ongoing @interval seconds duration is expired. Record the latency only if it is not expired. - :param: infer_request: InferRequest returned from inference callback, which includes the result of inference request. - :returns: - True, if metrics are updated. - False, if @interval seconds duration is not expired and metrics are not updated. - """ - self.latency_list.append(infer_request.latency) - exec_time = time.perf_counter() - self.start_time - if exec_time >= self.interval: - # Update the performance metrics. - self.start_time = time.perf_counter() - self.fps = len(self.latency_list) / exec_time - self.latency = sum(self.latency_list) / len(self.latency_list) - print(f"throughput: {self.fps: .2f}fps, latency: {self.latency: .2f}ms, time interval:{exec_time: .2f}s") - sys.stdout.flush() - self.latency_list = [] - return True - else: - return False - - - class InferContext: - """ - Inference context. Record and update peforamnce metrics via @metrics, set @feed_inference to False once @remaining_update_num <=0 - :member: metrics: instance of class PerformanceMetrics - :member: remaining_update_num: the remaining times for peforamnce metrics updating. - :member: feed_inference: if feed inference request is required or not. - """ - - def __init__(self, update_interval, num): - """ - Create and initilize one instance of class InferContext. - :param: update_interval: The performance metrics will be updated every @update_interval seconds. This parameter will be passed to class PerformanceMetrics directly. - :param: num: The number of times performance metrics are updated. - :returns: - Instance of InferContext. - """ - self.metrics = PerformanceMetrics(update_interval) - self.remaining_update_num = num - self.feed_inference = True - - def update(self, infer_request: ov.InferRequest): - """ - Update the context. Set @feed_inference to False if the number of remaining performance metric updates (@remaining_update_num) reaches 0 - :param: infer_request: InferRequest returned from inference callback, which includes the result of inference request. - :returns: None - """ - if self.remaining_update_num <= 0: - self.feed_inference = False - - if self.metrics.update(infer_request): - self.remaining_update_num = self.remaining_update_num - 1 - if self.remaining_update_num <= 0: - self.feed_inference = False - - - def completion_callback(infer_request: ov.InferRequest, context) -> None: - """ - callback for the inference request, pass the @infer_request to @context for updating - :param: infer_request: InferRequest returned for the callback, which includes the result of inference request. 
- :param: context: user data which is passed as the second parameter to AsyncInferQueue:start_async() - :returns: None - """ - context.update(infer_request) - - - # Performance metrics update interval (seconds) and number of times. - metrics_update_interval = 10 - metrics_update_num = 6 - -Inference with THROUGHPUT hint -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Loop for inference and update the FPS/Latency every -@metrics_update_interval seconds. - -.. code:: ipython3 - - import openvino.properties.hint as hints - - - THROUGHPUT_hint_context = InferContext(metrics_update_interval, metrics_update_num) - - print("Compiling Model for AUTO device with THROUGHPUT hint") - sys.stdout.flush() - - compiled_model = core.compile_model(model=ov_model, config={hints.performance_mode(): hints.PerformanceMode.THROUGHPUT}) - - infer_queue = ov.AsyncInferQueue(compiled_model, 0) # Setting to 0 will query optimal number by default. - infer_queue.set_callback(completion_callback) - - print(f"Start inference, {metrics_update_num: .0f} groups of FPS/latency will be measured over {metrics_update_interval: .0f}s intervals") - sys.stdout.flush() - - while THROUGHPUT_hint_context.feed_inference: - infer_queue.start_async(input_tensor, THROUGHPUT_hint_context) - - infer_queue.wait_all() - - # Take the FPS and latency of the latest period. - THROUGHPUT_hint_fps = THROUGHPUT_hint_context.metrics.fps - THROUGHPUT_hint_latency = THROUGHPUT_hint_context.metrics.latency - - print("Done") - - del compiled_model - - -.. parsed-literal:: - - Compiling Model for AUTO device with THROUGHPUT hint - Start inference, 6 groups of FPS/latency will be measured over 10s intervals - throughput: 183.01fps, latency: 31.42ms, time interval: 10.01s - throughput: 181.78fps, latency: 32.22ms, time interval: 10.02s - throughput: 181.19fps, latency: 32.39ms, time interval: 10.01s - throughput: 180.90fps, latency: 32.34ms, time interval: 10.01s - throughput: 182.92fps, latency: 32.06ms, time interval: 10.01s - throughput: 183.14fps, latency: 31.93ms, time interval: 10.01s - Done - - -Inference with LATENCY hint -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Loop for inference and update the FPS/Latency for each -@metrics_update_interval seconds - -.. code:: ipython3 - - LATENCY_hint_context = InferContext(metrics_update_interval, metrics_update_num) - - print("Compiling Model for AUTO Device with LATENCY hint") - sys.stdout.flush() - - compiled_model = core.compile_model(model=ov_model, config={hints.performance_mode(): hints.PerformanceMode.LATENCY}) - - # Setting to 0 will query optimal number by default. - infer_queue = ov.AsyncInferQueue(compiled_model, 0) - infer_queue.set_callback(completion_callback) - - print(f"Start inference, {metrics_update_num: .0f} groups fps/latency will be out with {metrics_update_interval: .0f}s interval") - sys.stdout.flush() - - while LATENCY_hint_context.feed_inference: - infer_queue.start_async(input_tensor, LATENCY_hint_context) - - infer_queue.wait_all() - - # Take the FPS and latency of the latest period. - LATENCY_hint_fps = LATENCY_hint_context.metrics.fps - LATENCY_hint_latency = LATENCY_hint_context.metrics.latency - - print("Done") - - del compiled_model - - -.. 
parsed-literal:: - - Compiling Model for AUTO Device with LATENCY hint - Start inference, 6 groups fps/latency will be out with 10s interval - throughput: 140.07fps, latency: 6.62ms, time interval: 10.00s - throughput: 142.55fps, latency: 6.62ms, time interval: 10.00s - throughput: 138.69fps, latency: 6.83ms, time interval: 10.01s - throughput: 138.53fps, latency: 6.83ms, time interval: 10.01s - throughput: 142.77fps, latency: 6.63ms, time interval: 10.00s - throughput: 142.70fps, latency: 6.63ms, time interval: 10.01s - Done - - -Difference in FPS and latency -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - import matplotlib.pyplot as plt - - TPUT = 0 - LAT = 1 - labels = ["THROUGHPUT hint", "LATENCY hint"] - - fig1, ax1 = plt.subplots(1, 1) - fig1.patch.set_visible(False) - ax1.axis("tight") - ax1.axis("off") - - cell_text = [] - cell_text.append( - [ - "%.2f%s" % (THROUGHPUT_hint_fps, " FPS"), - "%.2f%s" % (THROUGHPUT_hint_latency, " ms"), - ] - ) - cell_text.append(["%.2f%s" % (LATENCY_hint_fps, " FPS"), "%.2f%s" % (LATENCY_hint_latency, " ms")]) - - table = ax1.table( - cellText=cell_text, - colLabels=["FPS (Higher is better)", "Latency (Lower is better)"], - rowLabels=labels, - rowColours=["deepskyblue"] * 2, - colColours=["deepskyblue"] * 2, - cellLoc="center", - loc="upper left", - ) - table.auto_set_font_size(False) - table.set_fontsize(18) - table.auto_set_column_width(0) - table.auto_set_column_width(1) - table.scale(1, 3) - - fig1.tight_layout() - plt.show() - - - -.. image:: auto-device-with-output_files/auto-device-with-output_27_0.png - - -.. code:: ipython3 - - # Output the difference. - width = 0.4 - fontsize = 14 - - plt.rc("font", size=fontsize) - fig, ax = plt.subplots(1, 2, figsize=(10, 8)) - - rects1 = ax[0].bar([0], THROUGHPUT_hint_fps, width, label=labels[TPUT], color="#557f2d") - rects2 = ax[0].bar([width], LATENCY_hint_fps, width, label=labels[LAT]) - ax[0].set_ylabel("frames per second") - ax[0].set_xticks([width / 2]) - ax[0].set_xticklabels(["FPS"]) - ax[0].set_xlabel("Higher is better") - - rects1 = ax[1].bar([0], THROUGHPUT_hint_latency, width, label=labels[TPUT], color="#557f2d") - rects2 = ax[1].bar([width], LATENCY_hint_latency, width, label=labels[LAT]) - ax[1].set_ylabel("milliseconds") - ax[1].set_xticks([width / 2]) - ax[1].set_xticklabels(["Latency (ms)"]) - ax[1].set_xlabel("Lower is better") - - fig.suptitle("Performance Hints") - fig.legend(labels, fontsize=fontsize) - fig.tight_layout() - - plt.show() - - - -.. 
image:: auto-device-with-output_files/auto-device-with-output_28_0.png - diff --git a/docs/notebooks/auto-device-with-output_files/auto-device-with-output_14_1.jpg b/docs/notebooks/auto-device-with-output_files/auto-device-with-output_14_1.jpg deleted file mode 100644 index abe8cb45ca2fd7..00000000000000 --- a/docs/notebooks/auto-device-with-output_files/auto-device-with-output_14_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5a58493845ebd3e98186df7a1ea042b20545bb3c2a5b4a326163da6c9eb5e7d9 -size 121563 diff --git a/docs/notebooks/auto-device-with-output_files/auto-device-with-output_14_1.png b/docs/notebooks/auto-device-with-output_files/auto-device-with-output_14_1.png deleted file mode 100644 index 30f6673bf72850..00000000000000 --- a/docs/notebooks/auto-device-with-output_files/auto-device-with-output_14_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:84112f4f04be0a3445174f4dcf7e300fc12dbd50fd8a5e2d98b4082402dda6de -size 869661 diff --git a/docs/notebooks/auto-device-with-output_files/auto-device-with-output_27_0.png b/docs/notebooks/auto-device-with-output_files/auto-device-with-output_27_0.png deleted file mode 100644 index 15ebd7f4962d2e..00000000000000 --- a/docs/notebooks/auto-device-with-output_files/auto-device-with-output_27_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:546abc40ef39108cca926590b41f9a5c43b279ac1ed1a5ab0f36e83b34ce4757 -size 26677 diff --git a/docs/notebooks/auto-device-with-output_files/auto-device-with-output_28_0.png b/docs/notebooks/auto-device-with-output_files/auto-device-with-output_28_0.png deleted file mode 100644 index 88e950c0f8af8b..00000000000000 --- a/docs/notebooks/auto-device-with-output_files/auto-device-with-output_28_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2b05dc6945984939ff41d828ff2a0f9e57b2d7c13914c288e519f600e27c4f87 -size 40020 diff --git a/docs/notebooks/bark-text-to-audio-with-output.rst b/docs/notebooks/bark-text-to-audio-with-output.rst deleted file mode 100644 index 87c58063117301..00000000000000 --- a/docs/notebooks/bark-text-to-audio-with-output.rst +++ /dev/null @@ -1,1043 +0,0 @@ -Text-to-speech generation using Bark and OpenVINO -================================================= - -🐶 Bark is a transformer-based text-to-audio model created by -`Suno `__. Bark can generate highly realistic, -multilingual speech as well as other audio - including music, background -noise and simple sound effects. The model can also produce nonverbal -communications like laughing, sighing and crying. - -With Bark, users can also produce nonverbal communications like -laughing, sighing, and crying, making it a versatile tool for a variety -of applications. - -.. figure:: https://user-images.githubusercontent.com/5068315/235310676-a4b3b511-90ec-4edf-8153-7ccf14905d73.png - :alt: image.png - - image.png - -Bark is a cutting-edge text-to-speech (TTS) technology that has taken -the AI world by storm. Unlike the typical TTS engines that sound robotic -and mechanic, Bark offers human-like voices that are highly realistic -and natural sounding. Bark uses GPT-style models to generate speech with -minimal tweaking, producing highly expressive and emotive voices that -can capture nuances such as tone, pitch, and rhythm. It offers a -fantastic experience that can leave you wondering if you’re listening to -human beings. 
- -Notably, Bark supports multiple languages and can generate speech in -Mandarin, French, Italian, Spanish, and other languages with impressive -clarity and accuracy. With Bark, you can easily switch between languages -and still enjoy high-quality sound effects. - -Bark is not only intelligent but also intuitive, making it an ideal tool -for individuals and businesses looking to create high-quality voice -content for their platforms. Whether you’re looking to create podcasts, -audiobooks, video game sounds, or any other form of voice content, Bark -has you covered. - -So, if you’re looking for a revolutionary text-to-speech technology that -can elevate your voice content, Bark is the way to go! In this tutorial -we consider how to convert and run bark with OpenVINO. - -About model ------------ - -Bark uses GPT-style models to generate audio from scratch, but the -initial text prompt is embedded into high-level semantic tokens without -the use of phonemes. This allows Bark to generalize to arbitrary -instructions beyond speech that occur in the training data, such as -music lyrics, sound effects, or other non-speech sounds. - -A subsequent second model is used to convert the generated semantic -tokens into audio codec tokens to generate the full waveform. To enable -the community to use Bark via public code, EnCodec codec from Facebook -is used to act as an audio representation. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Download and Convert models <#download-and-convert-models>`__ - - - `Text Encoder <#text-encoder>`__ - - `Coarse encoder <#coarse-encoder>`__ - - `Fine encoder <#fine-encoder>`__ - - `Prepare Inference pipeline <#prepare-inference-pipeline>`__ - -- `Run model inference <#run-model-inference>`__ - - - `Select Inference device <#select-inference-device>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "torch" "torchvision" "torchaudio" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "openvino>=2023.1.0" "gradio>=4.19" - %pip install -q "git+https://github.com/suno-ai/bark.git" --extra-index-url https://download.pytorch.org/whl/cpu - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("bark-text-to-audio.ipynb") - -Download and Convert models ---------------------------- - - - -.. code:: ipython3 - - from pathlib import Path - from bark.generation import load_model, codec_decode, _flatten_codebooks - - models_dir = Path("models") - models_dir.mkdir(exist_ok=True) - -Text Encoder -~~~~~~~~~~~~ - - - -Text encoder is responsible for embedding initial text prompt into -high-level semantic tokens. it uses tokenizer for conversion input text -to token ids and predicts semantic text tokens that capture the meaning -of the text. 
There are some differences between text encoder behavior on -first step and others. It is the reason why we need to use separated -models for that. - -.. code:: ipython3 - - text_use_small = True - - text_encoder = load_model(model_type="text", use_gpu=False, use_small=text_use_small, force_reload=False, weights_only=False) - - text_encoder_model = text_encoder["model"] - tokenizer = text_encoder["tokenizer"] - -.. code:: ipython3 - - import torch - import openvino as ov - - text_model_suffix = "_small" if text_use_small else "" - text_model_dir = models_dir / f"text_encoder{text_model_suffix}" - text_model_dir.mkdir(exist_ok=True) - text_encoder_path1 = text_model_dir / "bark_text_encoder_1.xml" - text_encoder_path0 = text_model_dir / "bark_text_encoder_0.xml" - -.. code:: ipython3 - - class TextEncoderModel(torch.nn.Module): - def __init__(self, encoder): - super().__init__() - self.encoder = encoder - - def forward(self, idx, past_kv=None): - return self.encoder(idx, merge_context=True, past_kv=past_kv, use_cache=True) - - - if not text_encoder_path0.exists() or not text_encoder_path1.exists(): - text_encoder_exportable = TextEncoderModel(text_encoder_model) - ov_model = ov.convert_model(text_encoder_exportable, example_input=torch.ones((1, 513), dtype=torch.int64)) - ov.save_model(ov_model, text_encoder_path0) - logits, kv_cache = text_encoder_exportable(torch.ones((1, 513), dtype=torch.int64)) - ov_model = ov.convert_model( - text_encoder_exportable, - example_input=(torch.ones((1, 1), dtype=torch.int64), kv_cache), - ) - ov.save_model(ov_model, text_encoder_path1) - del ov_model - del text_encoder_exportable - del text_encoder_model, text_encoder - - -.. parsed-literal:: - - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/bark/model.py:176: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert(idx.shape[1] >= 256+256+1) - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/bark/model.py:199: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert position_ids.shape == (1, t) - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/bark/model.py:172: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert t == 1 - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/torch/jit/_trace.py:160: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at aten/src/ATen/core/TensorBody.h:489.) 
- if a.grad is not None: - - -Coarse encoder -~~~~~~~~~~~~~~ - - - -Coarse encoder is a causal autoregressive transformer, that takes as -input the results of the text encoder model. It aims at predicting the -first two audio codebooks necessary for EnCodec. Coarse encoder is -autoregressive model, it means that for making prediction on next step, -it uses own output from previous step. For reducing model complexity and -optimization, caching key and values for attention blocks can be used. -past_key_values contains set of precomputed attention keys and values -for each attention module in the model from previous step as they will -be not changed from step to step and allow us calculate only update for -the current step and join to previous. For avoiding to have separated -model for first inference, where model does not have “past”, we will -provide empty tensor on the first step. - -.. code:: ipython3 - - coarse_use_small = True - - coarse_model = load_model( - model_type="coarse", - use_gpu=False, - use_small=coarse_use_small, - force_reload=False, - ) - - coarse_model_suffix = "_small" if coarse_use_small else "" - coarse_model_dir = models_dir / f"coarse{coarse_model_suffix}" - coarse_model_dir.mkdir(exist_ok=True) - coarse_encoder_path = coarse_model_dir / "bark_coarse_encoder.xml" - -.. code:: ipython3 - - import types - - - class CoarseEncoderModel(torch.nn.Module): - def __init__(self, encoder): - super().__init__() - self.encoder = encoder - - def forward(self, idx, past_kv=None): - return self.encoder(idx, past_kv=past_kv, use_cache=True) - - - def casual_self_attention_forward(self, x, past_kv=None, use_cache=False): - B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd) - - # calculate query, key, values for all heads in batch and move head forward to be the batch dim - q, k, v = self.c_attn(x).split(self.n_embd, dim=2) - k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) - q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) - v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) - past_len = 0 - - if past_kv is not None: - past_key = past_kv[0] - past_value = past_kv[1] - k = torch.cat((past_key, k), dim=-2) - v = torch.cat((past_value, v), dim=-2) - past_len = past_key.shape[-2] - - FULL_T = k.shape[-2] - - if use_cache is True: - present = (k, v) - else: - present = None - - # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T) - if self.flash: - # efficient attention using Flash Attention - full_attention_mask = torch.ones( - B, - T, - T, - device=x.device, - dtype=torch.float, - ) * float("-inf") - full_attention_mask.triu_(diagonal=1) - if past_len > 0: - full_attention_mask = torch.cat( - ( - torch.zeros(B, T, past_len, device=x.device), - full_attention_mask, - ), - dim=-1, - ) - - y = torch.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout, attn_mask=full_attention_mask) - else: - # manual implementation of attention - att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) - att = att.masked_fill(self.bias[:, :, FULL_T - T : FULL_T, :FULL_T] == 0, float("-inf")) - att = F.softmax(att, dim=-1) - att = self.attn_dropout(att) - y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs) - y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side - - # output projection - y = self.resid_dropout(self.c_proj(y)) - return (y, present) - - - if not 
coarse_encoder_path.exists():
-        coarse_encoder_exportable = CoarseEncoderModel(coarse_model)
-        for block in coarse_encoder_exportable.encoder.transformer.h:
-            block.attn.forward = types.MethodType(casual_self_attention_forward, block.attn)
-        logits, kv_cache = coarse_encoder_exportable(torch.ones((1, 886), dtype=torch.int64))
-        ov_model = ov.convert_model(
-            coarse_encoder_exportable,
-            example_input=(torch.ones((1, 1), dtype=torch.int64), kv_cache),
-        )
-        ov.save_model(ov_model, coarse_encoder_path)
-        del ov_model
-        del coarse_encoder_exportable
-        del coarse_model
-
-
-.. parsed-literal::
-
-    /tmp/ipykernel_1046533/1432960018.py:47: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
-      if past_len > 0:
-
-
-.. code:: ipython3
-
-    fine_use_small = False
-
-    fine_model = load_model(model_type="fine", use_gpu=False, use_small=fine_use_small, force_reload=False)
-
-    fine_model_suffix = "_small" if fine_use_small else ""
-    fine_model_dir = models_dir / f"fine_model{fine_model_suffix}"
-    fine_model_dir.mkdir(exist_ok=True)
-
-.. code:: ipython3
-
-    class FineModel(torch.nn.Module):
-        def __init__(self, model):
-            super().__init__()
-            self.model = model
-
-        def forward(self, pred_idx, idx):
-            b, t, codes = idx.size()
-            pos = torch.arange(0, t, dtype=torch.long).unsqueeze(0)  # shape (1, t)
-
-            # forward the GPT model itself
-            tok_embs = [wte(idx[:, :, i]).unsqueeze(-1) for i, wte in enumerate(self.model.transformer.wtes)]  # token embeddings of shape (b, t, n_embd)
-            tok_emb = torch.cat(tok_embs, dim=-1)
-            pos_emb = self.model.transformer.wpe(pos)  # position embeddings of shape (1, t, n_embd)
-            x = tok_emb[:, :, :, : pred_idx + 1].sum(dim=-1)
-            x = self.model.transformer.drop(x + pos_emb)
-            for block in self.model.transformer.h:
-                x = block(x)
-            x = self.model.transformer.ln_f(x)
-            return x
-
-
-    fine_feature_extractor_path = fine_model_dir / "bark_fine_feature_extractor.xml"
-
-Fine encoder
-~~~~~~~~~~~~
-
-
-
-The fine encoder is, this time, a non-causal autoencoder transformer, which
-iteratively predicts the last codebooks based on the sum of the previous
-codebook embeddings obtained with the coarse encoder.
-
-.. code:: ipython3
-
-    if not fine_feature_extractor_path.exists():
-        lm_heads = fine_model.lm_heads
-        fine_feature_extractor = FineModel(fine_model)
-        feature_extractor_out = fine_feature_extractor(3, torch.zeros((1, 1024, 8), dtype=torch.int32))
-        ov_model = ov.convert_model(
-            fine_feature_extractor,
-            example_input=(
-                torch.ones(1, dtype=torch.long),
-                torch.zeros((1, 1024, 8), dtype=torch.long),
-            ),
-        )
-        ov.save_model(ov_model, fine_feature_extractor_path)
-        for i, lm_head in enumerate(lm_heads):
-            ov.save_model(
-                ov.convert_model(lm_head, example_input=feature_extractor_out),
-                fine_model_dir / f"bark_fine_lm_{i}.xml",
-            )
-
-Prepare Inference pipeline
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
-
-For better usability, classes for working with the models are provided below.
-
-..
code:: ipython3 - - class OVBarkTextEncoder: - def __init__(self, core, device, model_path1, model_path2): - self.compiled_model1 = core.compile_model(model_path1, device) - self.compiled_model2 = core.compile_model(model_path2, device) - - def __call__(self, input_ids, past_kv=None): - if past_kv is None: - outputs = self.compiled_model1(input_ids, share_outputs=True) - else: - outputs = self.compiled_model2([input_ids, *past_kv], share_outputs=True) - logits, kv_cache = self.postprocess_outputs(outputs, past_kv is None) - return logits, kv_cache - - def postprocess_outputs(self, outs, is_first_stage): - net_outs = self.compiled_model1.outputs if is_first_stage else self.compiled_model2.outputs - logits = outs[net_outs[0]] - kv_cache = [] - for out_tensor in net_outs[1:]: - kv_cache.append(outs[out_tensor]) - return logits, kv_cache - - - class OVBarkEncoder: - def __init__(self, core, device, model_path): - self.compiled_model = core.compile_model(model_path, device) - - def __call__(self, idx, past_kv=None): - if past_kv is None: - past_kv = self._init_past_kv() - outs = self.compiled_model([idx, *past_kv], share_outputs=True) - return self.postprocess_outputs(outs) - - def postprocess_outputs(self, outs): - net_outs = self.compiled_model.outputs - logits = outs[net_outs[0]] - kv_cache = [] - for out_tensor in net_outs[1:]: - kv_cache.append(outs[out_tensor]) - return logits, kv_cache - - def _init_past_kv(self): - inputs = [] - for input_t in self.compiled_model.inputs[1:]: - input_shape = input_t.partial_shape - input_shape[0] = 1 - input_shape[2] = 0 - inputs.append(ov.Tensor(ov.Type.f32, input_shape.get_shape())) - return inputs - - - class OVBarkFineEncoder: - def __init__(self, core, device, model_dir, num_lm_heads=7): - self.feats_compiled_model = core.compile_model(model_dir / "bark_fine_feature_extractor.xml", device) - self.feats_out = self.feats_compiled_model.output(0) - lm_heads = [] - for i in range(num_lm_heads): - lm_heads.append(core.compile_model(model_dir / f"bark_fine_lm_{i}.xml", device)) - self.lm_heads = lm_heads - - def __call__(self, pred_idx, idx): - feats = self.feats_compiled_model([ov.Tensor(pred_idx), ov.Tensor(idx)])[self.feats_out] - lm_id = pred_idx - 1 - logits = self.lm_heads[int(lm_id)](feats)[0] - return logits - -``generate_audio`` function is the main function for starting audio -generation process. It accepts input text and optionally history prompt, -provided by user and run inference pipeline. The inference pipeline -consists from several steps, illustrated on the diagram below: - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/3a272a34-50bc-4d4a-bb1f-8a649cbf1d6d - :alt: bark_pipeline - - bark_pipeline - -1. Generation semantic tokens from input text using Text Encoder -2. Generation coarse acoustic codebooks from semantic tokens using - Coarse Encoder -3. Generation fine acoustic codebooks from coarse codebooks using Fine - Encoder -4. Decode codebooks to audio waveform - -.. code:: ipython3 - - from typing import Optional, Union, Dict - import tqdm - import numpy as np - - - def generate_audio( - text: str, - history_prompt: Optional[Union[Dict, str]] = None, - text_temp: float = 0.7, - waveform_temp: float = 0.7, - silent: bool = False, - ): - """Generate audio array from input text. 
- - Args: - text: text to be turned into audio - history_prompt: history choice for audio cloning - text_temp: generation temperature (1.0 more diverse, 0.0 more conservative) - waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative) - silent: disable progress bar - - Returns: - numpy audio array at sample frequency 24khz - """ - semantic_tokens = text_to_semantic( - text, - history_prompt=history_prompt, - temp=text_temp, - silent=silent, - ) - out = semantic_to_waveform( - semantic_tokens, - history_prompt=history_prompt, - temp=waveform_temp, - silent=silent, - ) - return out - -.. code:: ipython3 - - def text_to_semantic( - text: str, - history_prompt: Optional[Union[Dict, str]] = None, - temp: float = 0.7, - silent: bool = False, - ): - """Generate semantic array from text. - - Args: - text: text to be turned into audio - history_prompt: history choice for audio cloning - temp: generation temperature (1.0 more diverse, 0.0 more conservative) - silent: disable progress bar - - Returns: - numpy semantic array to be fed into `semantic_to_waveform` - """ - x_semantic = generate_text_semantic( - text, - history_prompt=history_prompt, - temp=temp, - silent=silent, - ) - return x_semantic - -.. code:: ipython3 - - from bark.generation import ( - _load_history_prompt, - _tokenize, - _normalize_whitespace, - TEXT_PAD_TOKEN, - TEXT_ENCODING_OFFSET, - SEMANTIC_VOCAB_SIZE, - SEMANTIC_PAD_TOKEN, - SEMANTIC_INFER_TOKEN, - COARSE_RATE_HZ, - SEMANTIC_RATE_HZ, - N_COARSE_CODEBOOKS, - COARSE_INFER_TOKEN, - CODEBOOK_SIZE, - N_FINE_CODEBOOKS, - COARSE_SEMANTIC_PAD_TOKEN, - ) - import torch.nn.functional as F - from typing import List, Optional, Union, Dict - - - def generate_text_semantic( - text: str, - history_prompt: List[str] = None, - temp: float = 0.7, - top_k: int = None, - top_p: float = None, - silent: bool = False, - min_eos_p: float = 0.2, - max_gen_duration_s: int = None, - allow_early_stop: bool = True, - ): - """ - Generate semantic tokens from text. 
- Args: - text: text to be turned into audio - history_prompt: history choice for audio cloning - temp: generation temperature (1.0 more diverse, 0.0 more conservative) - top_k: top k number of probabilities for considering during generation - top_p: top probabilities higher than p for considering during generation - silent: disable progress bar - min_eos_p: minimum probability to select end of string token - max_gen_duration_s: maximum duration for generation in seconds - allow_early_stop: allow to stop generation if maximum duration is not reached - Returns: - numpy semantic array to be fed into `semantic_to_waveform` - - """ - text = _normalize_whitespace(text) - if history_prompt is not None: - history_prompt = _load_history_prompt(history_prompt) - semantic_history = history_prompt["semantic_prompt"] - else: - semantic_history = None - encoded_text = np.ascontiguousarray(_tokenize(tokenizer, text)) + TEXT_ENCODING_OFFSET - if len(encoded_text) > 256: - encoded_text = encoded_text[:256] - encoded_text = np.pad( - encoded_text, - (0, 256 - len(encoded_text)), - constant_values=TEXT_PAD_TOKEN, - mode="constant", - ) - if semantic_history is not None: - semantic_history = semantic_history.astype(np.int64) - # lop off if history is too long, pad if needed - semantic_history = semantic_history[-256:] - semantic_history = np.pad( - semantic_history, - (0, 256 - len(semantic_history)), - constant_values=SEMANTIC_PAD_TOKEN, - mode="constant", - ) - else: - semantic_history = np.array([SEMANTIC_PAD_TOKEN] * 256) - x = np.hstack([encoded_text, semantic_history, np.array([SEMANTIC_INFER_TOKEN])]).astype(np.int64)[None] - assert x.shape[1] == 256 + 256 + 1 - n_tot_steps = 768 - # custom tqdm updates since we don't know when eos will occur - pbar = tqdm.tqdm(disable=silent, total=100) - pbar_state = 0 - tot_generated_duration_s = 0 - kv_cache = None - for n in range(n_tot_steps): - if kv_cache is not None: - x_input = x[:, [-1]] - else: - x_input = x - logits, kv_cache = ov_text_model(ov.Tensor(x_input), kv_cache) - relevant_logits = logits[0, 0, :SEMANTIC_VOCAB_SIZE] - if allow_early_stop: - relevant_logits = np.hstack((relevant_logits, logits[0, 0, [SEMANTIC_PAD_TOKEN]])) # eos - if top_p is not None: - sorted_indices = np.argsort(relevant_logits)[::-1] - sorted_logits = relevant_logits[sorted_indices] - cumulative_probs = np.cumsum(F.softmax(sorted_logits)) - sorted_indices_to_remove = cumulative_probs > top_p - sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy() - sorted_indices_to_remove[0] = False - relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf - relevant_logits = torch.from_numpy(relevant_logits) - if top_k is not None: - relevant_logits = torch.from_numpy(relevant_logits) - v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1))) - relevant_logits[relevant_logits < v[-1]] = -float("Inf") - probs = F.softmax(torch.from_numpy(relevant_logits) / temp, dim=-1) - item_next = torch.multinomial(probs, num_samples=1) - if allow_early_stop and (item_next == SEMANTIC_VOCAB_SIZE or (min_eos_p is not None and probs[-1] >= min_eos_p)): - # eos found, so break - pbar.update(100 - pbar_state) - break - x = torch.cat((torch.from_numpy(x), item_next[None]), dim=1).numpy() - tot_generated_duration_s += 1 / SEMANTIC_RATE_HZ - if max_gen_duration_s is not None and tot_generated_duration_s > max_gen_duration_s: - pbar.update(100 - pbar_state) - break - if n == n_tot_steps - 1: - pbar.update(100 - pbar_state) - break - del logits, relevant_logits, probs, 
item_next - req_pbar_state = np.min([100, int(round(100 * n / n_tot_steps))]) - if req_pbar_state > pbar_state: - pbar.update(req_pbar_state - pbar_state) - pbar_state = req_pbar_state - pbar.close() - out = x.squeeze()[256 + 256 + 1 :] - return out - -.. code:: ipython3 - - def semantic_to_waveform( - semantic_tokens: np.ndarray, - history_prompt: Optional[Union[Dict, str]] = None, - temp: float = 0.7, - silent: bool = False, - ): - """Generate audio array from semantic input. - - Args: - semantic_tokens: semantic token output from `text_to_semantic` - history_prompt: history choice for audio cloning - temp: generation temperature (1.0 more diverse, 0.0 more conservative) - silent: disable progress bar - - Returns: - numpy audio array at sample frequency 24khz - """ - coarse_tokens = generate_coarse( - semantic_tokens, - history_prompt=history_prompt, - temp=temp, - silent=silent, - ) - fine_tokens = generate_fine( - coarse_tokens, - history_prompt=history_prompt, - temp=0.5, - ) - audio_arr = codec_decode(fine_tokens) - return audio_arr - -.. code:: ipython3 - - def generate_coarse( - x_semantic: np.ndarray, - history_prompt: Optional[Union[Dict, str]] = None, - temp: float = 0.7, - top_k: int = None, - top_p: float = None, - silent: bool = False, - max_coarse_history: int = 630, # min 60 (faster), max 630 (more context) - sliding_window_len: int = 60, - ): - """ - Generate coarse audio codes from semantic tokens. - Args: - x_semantic: semantic token output from `text_to_semantic` - history_prompt: history prompt, will be prepened to generated if provided - temp: generation temperature (1.0 more diverse, 0.0 more conservative) - top_k: top k number of probabilities for considering during generation - top_p: top probabilities higher than p for considering during generation - silent: disable progress bar - max_coarse_history: threshold for cutting coarse history (minimum 60 for faster generation, maximum 630 for more context) - sliding_window_len: size of sliding window for generation cycle - Returns: - numpy audio array with coarse audio codes - - """ - semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS - max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio)) - if history_prompt is not None: - history_prompt = _load_history_prompt(history_prompt) - x_semantic_history = history_prompt["semantic_prompt"] - x_coarse_history = history_prompt["coarse_prompt"] - x_coarse_history = _flatten_codebooks(x_coarse_history) + SEMANTIC_VOCAB_SIZE - # trim histories correctly - n_semantic_hist_provided = np.min( - [ - max_semantic_history, - len(x_semantic_history) - len(x_semantic_history) % 2, - int(np.floor(len(x_coarse_history) / semantic_to_coarse_ratio)), - ] - ) - n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio)) - x_semantic_history = x_semantic_history[-n_semantic_hist_provided:].astype(np.int32) - x_coarse_history = x_coarse_history[-n_coarse_hist_provided:].astype(np.int32) - x_coarse_history = x_coarse_history[:-2] - else: - x_semantic_history = np.array([], dtype=np.int32) - x_coarse_history = np.array([], dtype=np.int32) - # start loop - n_steps = int(round(np.floor(len(x_semantic) * semantic_to_coarse_ratio / N_COARSE_CODEBOOKS) * N_COARSE_CODEBOOKS)) - x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32) - x_coarse = x_coarse_history.astype(np.int32) - base_semantic_idx = len(x_semantic_history) - x_semantic_in = x_semantic[None] - x_coarse_in = x_coarse[None] - 
n_window_steps = int(np.ceil(n_steps / sliding_window_len)) - n_step = 0 - for _ in tqdm.tqdm(range(n_window_steps), total=n_window_steps, disable=silent): - semantic_idx = base_semantic_idx + int(round(n_step / semantic_to_coarse_ratio)) - # pad from right side - x_in = x_semantic_in[:, np.max([0, semantic_idx - max_semantic_history]) :] - x_in = x_in[:, :256] - x_in = F.pad( - torch.from_numpy(x_in), - (0, 256 - x_in.shape[-1]), - "constant", - COARSE_SEMANTIC_PAD_TOKEN, - ) - x_in = torch.hstack( - [ - x_in, - torch.tensor([COARSE_INFER_TOKEN])[None], - torch.from_numpy(x_coarse_in[:, -max_coarse_history:]), - ] - ).numpy() - kv_cache = None - for _ in range(sliding_window_len): - if n_step >= n_steps: - continue - is_major_step = n_step % N_COARSE_CODEBOOKS == 0 - - if kv_cache is not None: - x_input = x_in[:, [-1]] - else: - x_input = x_in - - logits, kv_cache = ov_coarse_model(x_input, past_kv=kv_cache) - logit_start_idx = SEMANTIC_VOCAB_SIZE + (1 - int(is_major_step)) * CODEBOOK_SIZE - logit_end_idx = SEMANTIC_VOCAB_SIZE + (2 - int(is_major_step)) * CODEBOOK_SIZE - relevant_logits = logits[0, 0, logit_start_idx:logit_end_idx] - if top_p is not None: - sorted_indices = np.argsort(relevant_logits)[::-1] - sorted_logits = relevant_logits[sorted_indices] - cumulative_probs = np.cumsum(F.softmax(sorted_logits)) - sorted_indices_to_remove = cumulative_probs > top_p - sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy() - sorted_indices_to_remove[0] = False - relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf - relevant_logits = torch.from_numpy(relevant_logits) - if top_k is not None: - relevant_logits = torch.from_numpy(relevant_logits) - v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1))) - relevant_logits[relevant_logits < v[-1]] = -float("Inf") - probs = F.softmax(torch.from_numpy(relevant_logits) / temp, dim=-1) - item_next = torch.multinomial(probs, num_samples=1) - item_next = item_next - item_next += logit_start_idx - x_coarse_in = torch.cat((torch.from_numpy(x_coarse_in), item_next[None]), dim=1).numpy() - x_in = torch.cat((torch.from_numpy(x_in), item_next[None]), dim=1).numpy() - del logits, relevant_logits, probs, item_next - n_step += 1 - del x_in - del x_semantic_in - gen_coarse_arr = x_coarse_in.squeeze()[len(x_coarse_history) :] - del x_coarse_in - gen_coarse_audio_arr = gen_coarse_arr.reshape(-1, N_COARSE_CODEBOOKS).T - SEMANTIC_VOCAB_SIZE - for n in range(1, N_COARSE_CODEBOOKS): - gen_coarse_audio_arr[n, :] -= n * CODEBOOK_SIZE - return gen_coarse_audio_arr - - - def generate_fine( - x_coarse_gen: np.ndarray, - history_prompt: Optional[Union[Dict, str]] = None, - temp: float = 0.5, - silent: bool = True, - ): - """ - Generate full audio codes from coarse audio codes. 
- Args: - x_coarse_gen: generated coarse codebooks from `generate_coarse` - history_prompt: history prompt, will be prepended to generated - temp: generation temperature (1.0 more diverse, 0.0 more conservative) - silent: disable progress bar - Returns: - numpy audio array with coarse audio codes - - """ - if history_prompt is not None: - history_prompt = _load_history_prompt(history_prompt) - x_fine_history = history_prompt["fine_prompt"] - else: - x_fine_history = None - n_coarse = x_coarse_gen.shape[0] - # make input arr - in_arr = np.vstack( - [ - x_coarse_gen, - np.zeros((N_FINE_CODEBOOKS - n_coarse, x_coarse_gen.shape[1])) + CODEBOOK_SIZE, - ] - ).astype( - np.int32 - ) # padding - # prepend history if available (max 512) - if x_fine_history is not None: - x_fine_history = x_fine_history.astype(np.int32) - in_arr = np.hstack([x_fine_history[:, -512:].astype(np.int32), in_arr]) - n_history = x_fine_history[:, -512:].shape[1] - else: - n_history = 0 - n_remove_from_end = 0 - # need to pad if too short (since non-causal model) - if in_arr.shape[1] < 1024: - n_remove_from_end = 1024 - in_arr.shape[1] - in_arr = np.hstack( - [ - in_arr, - np.zeros((N_FINE_CODEBOOKS, n_remove_from_end), dtype=np.int32) + CODEBOOK_SIZE, - ] - ) - n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1 - in_arr = in_arr.T - for n in tqdm.tqdm(range(n_loops), disable=silent): - start_idx = np.min([n * 512, in_arr.shape[0] - 1024]) - start_fill_idx = np.min([n_history + n * 512, in_arr.shape[0] - 512]) - rel_start_fill_idx = start_fill_idx - start_idx - in_buffer = in_arr[start_idx : start_idx + 1024, :][None] - for nn in range(n_coarse, N_FINE_CODEBOOKS): - logits = ov_fine_model(np.array([nn]).astype(np.int64), in_buffer.astype(np.int64)) - if temp is None: - relevant_logits = logits[0, rel_start_fill_idx:, :CODEBOOK_SIZE] - codebook_preds = torch.argmax(relevant_logits, -1) - else: - relevant_logits = logits[0, :, :CODEBOOK_SIZE] / temp - probs = F.softmax(torch.from_numpy(relevant_logits), dim=-1) - codebook_preds = torch.hstack([torch.multinomial(probs[nnn], num_samples=1) for nnn in range(rel_start_fill_idx, 1024)]) - in_buffer[0, rel_start_fill_idx:, nn] = codebook_preds.numpy() - del logits, codebook_preds - for nn in range(n_coarse, N_FINE_CODEBOOKS): - in_arr[start_fill_idx : start_fill_idx + (1024 - rel_start_fill_idx), nn] = in_buffer[0, rel_start_fill_idx:, nn] - del in_buffer - gen_fine_arr = in_arr.squeeze().T - del in_arr - gen_fine_arr = gen_fine_arr[:, n_history:] - if n_remove_from_end > 0: - gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end] - return gen_fine_arr - -Run model inference -------------------- - - - -Now is time to see model in action. We need only wrap our models to -classes and run ``generate_audio`` function. - -Select Inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - core = ov.Core() - - ov_text_model = OVBarkTextEncoder(core, device.value, text_encoder_path0, text_encoder_path1) - ov_coarse_model = OVBarkEncoder(core, device.value, coarse_encoder_path) - ov_fine_model = OVBarkFineEncoder(core, device.value, fine_model_dir) - -.. 
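-
-Before running generation, you may also want to double-check which devices
-OpenVINO detects on your machine. The short snippet below is an illustrative
-addition and is not part of the original notebook.
-
-.. code:: ipython3
-
-    # Purely informational: the wrappers above are already compiled on `device.value`.
-    print(core.available_devices)
-
-..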
code:: ipython3 - - import time - from bark import SAMPLE_RATE - - torch.manual_seed(42) - t0 = time.time() - text = "Hello, my name is Suno. And, uh — and I like banana and apples. [laughs] But I also have other interests such as playing tic tac toe." - audio_array = generate_audio(text) - generation_duration_s = time.time() - t0 - audio_duration_s = audio_array.shape[0] / SAMPLE_RATE - - print(f"took {generation_duration_s:.0f}s to generate {audio_duration_s:.0f}s of audio") - - -.. parsed-literal:: - - 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:10<00:00, 9.63it/s] - 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:37<00:00, 1.17s/it] - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:30: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm. - warnings.warn("torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.") - - -.. parsed-literal:: - - took 53s to generate 13s of audio - - -.. code:: ipython3 - - from IPython.display import Audio - from bark import SAMPLE_RATE - - Audio(audio_array, rate=SAMPLE_RATE) - - - - -.. raw:: html - - - - - - - -Interactive demo ----------------- - - - -.. code:: ipython3 - - import numpy as np - - from bark import SAMPLE_RATE - from bark.generation import SUPPORTED_LANGS - - AVAILABLE_PROMPTS = ["Unconditional", "Announcer"] - PROMPT_LOOKUP = {} - for _, lang in SUPPORTED_LANGS: - for n in range(10): - label = f"Speaker {n} ({lang})" - AVAILABLE_PROMPTS.append(label) - PROMPT_LOOKUP[label] = f"{lang}_speaker_{n}" - PROMPT_LOOKUP["Unconditional"] = None - PROMPT_LOOKUP["Announcer"] = "announcer" - - - def gen_tts(text, history_prompt): - history_prompt = PROMPT_LOOKUP[history_prompt] - audio_arr = generate_audio(text, history_prompt=history_prompt) - audio_arr = (audio_arr * 32767).astype(np.int16) - return (SAMPLE_RATE, audio_arr) - -.. 
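-
-You can also call ``gen_tts`` directly and store the result on disk instead of
-listening to it in the browser. The snippet below is an illustrative sketch,
-not part of the original notebook, and assumes SciPy is available in the
-environment.
-
-.. code:: ipython3
-
-    from scipy.io import wavfile
-
-    # Generate speech for one of the available prompts and write it
-    # to a 16-bit PCM WAV file.
-    rate, pcm = gen_tts("Bark now runs with OpenVINO.", "Unconditional")
-    wavfile.write("bark_openvino_sample.wav", rate, pcm)
-
-..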
code:: ipython3 - - import requests - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/bark-text-to-audio/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=gen_tts, available_prompts=AVAILABLE_PROMPTS) - - try: - demo.launch(debug=False) - except Exception: - demo.launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/blip-visual-language-processing-with-output.rst b/docs/notebooks/blip-visual-language-processing-with-output.rst deleted file mode 100644 index aa01491b3d8d93..00000000000000 --- a/docs/notebooks/blip-visual-language-processing-with-output.rst +++ /dev/null @@ -1,1408 +0,0 @@ -Visual Question Answering and Image Captioning using BLIP and OpenVINO -====================================================================== - -Humans perceive the world through vision and language. A longtime goal -of AI is to build intelligent agents that can understand the world -through vision and language inputs to communicate with humans through -natural language. In order to achieve this goal, vision-language -pre-training has emerged as an effective approach, where deep neural -network models are pre-trained on large scale image-text datasets to -improve performance on downstream vision-language tasks, such as -image-text retrieval, image captioning, and visual question answering. - -`BLIP `__ is a language-image -pre-training framework for unified vision-language understanding and -generation. BLIP achieves state-of-the-art results on a wide range of -vision-language tasks. This tutorial demonstrates how to use BLIP for -visual question answering and image captioning. An additional part of -tutorial demonstrates how to speed up the model by applying 8-bit -post-training quantization and data free int8 weight compression from -`NNCF `__ (Neural Network -Compression Framework) to OpenVINO IR models and infer optimized BLIP -model via OpenVINO™ Toolkit. - -The tutorial consists of the following parts: - -1. Instantiate a BLIP model. -2. Convert the BLIP model to OpenVINO IR. -3. Run visual question answering and image captioning with OpenVINO. -4. Optimize BLIP model using NNCF -5. Compare original and optimized models -6. 
Launch interactive demo - - -**Table of contents:** - - -- `Background <#background>`__ - - - `Image Captioning <#image-captioning>`__ - - `Visual Question Answering <#visual-question-answering>`__ - -- `Instantiate Model <#instantiate-model>`__ -- `Convert Models to OpenVINO IR <#convert-models-to-openvino-ir>`__ - - - `Vision Model <#vision-model>`__ - - `Text Encoder <#text-encoder>`__ - - `Text Decoder <#text-decoder>`__ - -- `Run OpenVINO Model <#run-openvino-model>`__ - - - `Prepare Inference Pipeline <#prepare-inference-pipeline>`__ - - `Select inference device <#select-inference-device>`__ - - `Image Captioning <#image-captioning>`__ - - `Question Answering <#question-answering>`__ - -- `Optimize model using NNCF <#optimize-model-using-nncf>`__ - - - `Prepare dataset <#prepare-dataset>`__ - - `Quantize vision model <#quantize-vision-model>`__ - - `Quantize text encoder <#quantize-text-encoder>`__ - - `Compress weights of text - decoder <#compress-weights-of-text-decoder>`__ - - `Run optimized OpenVINO model <#run-optimized-openvino-model>`__ - - - `Image captioning <#image-captioning>`__ - - `Question answering <#question-answering>`__ - - - `Compare file sizes <#compare-file-sizes>`__ - - `Compare inference time of the FP16 and optimized - models <#compare-inference-time-of-the-fp16-and-optimized-models>`__ - -- `Interactive demo <#interactive-demo>`__ - - - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Background ----------- - - - -Visual language processing is a branch of artificial intelligence that -focuses on creating algorithms designed to enable computers to more -accurately understand images and their content. - -Popular tasks include: - -- **Text to Image Retrieval** - a semantic task that aims to find the - most relevant image for a given text description. -- **Image Captioning** - a semantic task that aims to provide a text - description for image content. -- **Visual Question Answering** - a semantic task that aims to answer - questions based on image content. - -As shown in the diagram below, these three tasks differ in the input -provided to the AI system. For text-to-image retrieval, you have a -predefined gallery of images for search and a user-requested text -description (query). Image captioning can be represented as a particular -case of visual question answering, where you have a predefined question -“What is in the picture?” and various images provided by a user. For -visual question answering, both the text-based question and image -context are variables requested by a user. - -|image0| - -This notebook does not focus on Text to Image retrieval. Instead, it -considers Image Captioning and Visual Question Answering. - -Image Captioning -~~~~~~~~~~~~~~~~ - - - -Image Captioning is the task of describing the content of an image in -words. This task lies at the intersection of computer vision and natural -language processing. Most image captioning systems use an -encoder-decoder framework, where an input image is encoded into an -intermediate representation of the information in the image, and then -decoded into a descriptive text sequence. - -|image1| - -Visual Question Answering -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Visual Question Answering (VQA) is the task of answering text-based -questions about image content. 
- -|image2| - -For a better understanding of how VQA works, let us consider a -traditional NLP task like Question Answering, which aims to retrieve the -answer to a question from a given text input. Typically, a question -answering pipeline consists of three steps: - -|image3| - -1. Question analysis - analysis of provided question in natural language - form to understand the object in the question and additional context. - For example, if you have a question like “How many bridges in - Paris?”, question words *“how many”* gives a hint that the answer is - more likely to be a number, *“bridges”* is the target object of the - question and *" in Paris"* serves as additional context for the - search. -2. Build query for search - use analyzed results to formalize query for - finding the most relevant information. -3. Perform a search in the knowledge base - send the query to a - knowledge base, typically provided text documents or databases serve - as a source of knowledge. - -|image4| - -The difference between text-based question answering and visual question -answering is that an image is used as context and the knowledge base. - -|image5| - -Answering arbitrary questions about images is a complex problem because -it requires involving a lot of computer vision sub-tasks. In the table -below, you can find an example of questions and the required computer -vision skills to find answers. - -+-----------------------------+----------------------------------------+ -| Computer vision task | Question examples | -+=============================+========================================+ -| Object recognition | What is shown in the picture? What is | -| | it? | -+-----------------------------+----------------------------------------+ -| Object detection | Is there any object (dog, man, book) | -| | in the image? Where is … located? | -+-----------------------------+----------------------------------------+ -| Object and image attribute | What color is an umbrella? Does this | -| recognition | man wear glasses? Is there color in | -| | the image? | -+-----------------------------+----------------------------------------+ -| Scene recognition | Is it rainy? What celebration is | -| | pictured? | -+-----------------------------+----------------------------------------+ -| Object counting | How many players are there on the | -| | football field? How many steps are | -| | there on the stairs? | -+-----------------------------+----------------------------------------+ -| Activity recognition | Is the baby crying? What is the woman | -| | cooking? What are they doing? | -+-----------------------------+----------------------------------------+ -| Spatial relationships among | What is located between the sofa and | -| objects | the armchair? What is in the bottom | -| | left corner? | -+-----------------------------+----------------------------------------+ -| Commonsense reasoning | Does she have 100% vision? Does this | -| | person have children? | -+-----------------------------+----------------------------------------+ -| Knowledge-based reasoning | Is it a vegetarian pizza? | -+-----------------------------+----------------------------------------+ -| Text recognition | What is the title of the book? What is | -| | shown on the screen? 
| -+-----------------------------+----------------------------------------+ - -There are a lot of applications for visual question answering: - -- Aid Visually Impaired Persons: VQA models can be used to reduce - barriers for visually impaired people by helping them get information - about images from the web and the real world. -- Education: VQA models can be used to improve visitor experiences at - museums by enabling observers to directly ask questions they are - interested in or to bring more interactivity to schoolbooks for - children interested in acquiring specific knowledge. -- E-commerce: VQA models can retrieve information about products using - photos from online stores. -- Independent expert assessment: VQA models can be provide objective - assessments in sports competitions, medical diagnosis, and forensic - examination. - -.. |image0| image:: https://user-images.githubusercontent.com/29454499/221755717-a5b51b7e-523c-461f-b30c-4edbfaf9a134.png -.. |image1| image:: https://user-images.githubusercontent.com/29454499/221640847-1868117c-aac0-4806-99a4-34f218e98bb8.png -.. |image2| image:: https://user-images.githubusercontent.com/29454499/221641984-3c6d8b2f-dd0d-4302-a4d8-0f8564fca772.png -.. |image3| image:: https://user-images.githubusercontent.com/29454499/221760881-378f1ea8-eadc-4610-aff0-69ecabf62fff.png -.. |image4| image:: https://user-images.githubusercontent.com/29454499/222094861-3cafdf9f-d700-4741-b6c5-fb09c1a4da9a.png -.. |image5| image:: https://user-images.githubusercontent.com/29454499/222095118-3d5826e4-2662-4d1c-abf2-a515f23d6d6a.png - -Instantiate Model ------------------ - - - -The BLIP model was proposed in the `BLIP: Bootstrapping Language-Image -Pre-training for Unified Vision-Language Understanding and -Generation `__ paper. - -.. figure:: https://github.com/salesforce/BLIP/raw/main/BLIP.gif - :alt: blip.gif - - blip.gif - -To pre-train a unified vision-language model with both understanding and -generation capabilities, BLIP introduces a multimodal mixture of an -encoder-decoder and a multi-task model which can operate in one of the -three modes: - -- **Unimodal encoders**, which separately encode images and text. The - image encoder is a vision transformer. The text encoder is the same - as BERT. -- **Image-grounded text encoder**, which injects visual information by - inserting a cross-attention layer between the self-attention layer - and the feed-forward network for each transformer block of the text - encoder. -- **Image-grounded text decoder**, which replaces the bi-directional - self-attention layers in the text encoder with causal self-attention - layers. - -More details about the model can be found in the `research -paper `__, `Salesforce -blog `__, -`GitHub repo `__ and `Hugging Face -model -documentation `__. - -In this tutorial, you will use the -`blip-vqa-base `__ -model available for download from `Hugging -Face `__. The same actions are also applicable -to other similar models from the BLIP family. Although this model class -is designed to perform question answering, its components can also be -reused for image captioning. - -To start working with the model, you need to instantiate the -``BlipForQuestionAnswering`` class, using ``from_pretrained`` method. -``BlipProcessor`` is a helper class for preparing input data for both -text and vision modalities and postprocessing of generation results. - -.. 
code:: ipython3 - - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "torch>=2.1.0" torchvision "transformers>=4.26.0" "gradio>=4.19" "openvino>=2023.3.0" "datasets>=2.14.6" "nncf>=2.8.1" "tqdm" - %pip install -q "matplotlib>=3.4" - -.. code:: ipython3 - - import time - from PIL import Image - from transformers import BlipProcessor, BlipForQuestionAnswering - from pathlib import Path - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, device_widget, quantization_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("blip-visual-language-processing.ipynb") - - # get model and processor - processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base") - model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") - - if not Path("demo.jpg").exists(): - # setup test input: download and read image, prepare question - img_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg" - download_file(img_url, "demo.jpg") - raw_image = Image.open("demo.jpg").convert("RGB") - question = "how many dogs are in the picture?" - # preprocess input data - inputs = processor(raw_image, question, return_tensors="pt") - - start = time.perf_counter() - # perform generation - out = model.generate(**inputs) - end = time.perf_counter() - start - - # postprocess result - answer = processor.decode(out[0], skip_special_tokens=True) - -.. code:: ipython3 - - print(f"Processing time: {end:.4f} s") - - -.. parsed-literal:: - - Processing time: 0.3707 s - - -.. code:: ipython3 - - from pathlib import Path - - if not Path("./utils.py").exists(): - download_file(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/blip-visual-language-processing/utils.py") - from utils import visualize_results - - fig = visualize_results(raw_image, answer, question) - - - -.. image:: blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_7_0.png - - -Convert Models to OpenVINO IR ------------------------------ - - - -Starting from OpenVINO 2023.0 release, OpenVINO supports direct PyTorch -models conversion to OpenVINO Intermediate Representation (IR) format to -take the advantage of advanced OpenVINO optimization tools and features. -You need to provide a model object, input data for model tracing to -OpenVINO Model Conversion API. ``ov.convert_model`` function convert -PyTorch model instance to ``ov.Model`` object that can be used for -compilation on device or saved on disk using ``ov.save_model`` in -compressed to FP16 format. - -The model consists of three parts: - -- vision_model - an encoder for image representation. -- text_encoder - an encoder for input query, used for question - answering and text-to-image retrieval only. -- text_decoder - a decoder for output answer. - -To be able to perform multiple tasks, using the same model components, -you should convert each part independently. - -Vision Model -~~~~~~~~~~~~ - - - -The vision model accepts float input tensors with the [1,3,384,384] -shape, containing RGB image pixel values normalized in the [0,1] range. - -.. 
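-
-As a rough illustration of that input layout, the sketch below builds a tensor
-of the expected shape by hand. It is an illustrative addition, not part of the
-original notebook: the actual preprocessing is performed by ``BlipProcessor``,
-so this snippet is not a drop-in replacement for it.
-
-.. code:: ipython3
-
-    import numpy as np
-    from PIL import Image
-
-    # Resize the demo image, scale pixel values to [0, 1] and reorder to NCHW.
-    image = Image.open("demo.jpg").convert("RGB").resize((384, 384))
-    pixels = np.asarray(image, dtype=np.float32) / 255.0  # HWC, values in [0, 1]
-    pixels = pixels.transpose(2, 0, 1)[None, ...]  # -> (1, 3, 384, 384)
-    print(pixels.shape)
-
-..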
code:: ipython3
-
-    import torch
-    from pathlib import Path
-    import openvino as ov
-
-    VISION_MODEL_OV = Path("blip_vision_model.xml")
-    vision_model = model.vision_model
-    vision_model.eval()
-
-    # check that model works and save its outputs for reuse as text encoder input
-    with torch.no_grad():
-        vision_outputs = vision_model(inputs["pixel_values"])
-
-    # if openvino model does not exist, convert it to IR
-    if not VISION_MODEL_OV.exists():
-        # export pytorch model to ov.Model
-        with torch.no_grad():
-            ov_vision_model = ov.convert_model(vision_model, example_input=inputs["pixel_values"])
-        # save model on disk for next usages
-        ov.save_model(ov_vision_model, VISION_MODEL_OV)
-        print(f"Vision model successfully converted and saved to {VISION_MODEL_OV}")
-    else:
-        print(f"Vision model will be loaded from {VISION_MODEL_OV}")
-
-
-.. parsed-literal::
-
-    /home/ltalamanova/tmp_venv/lib/python3.11/site-packages/transformers/modeling_utils.py:4225: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead
-      warnings.warn(
-
-
-.. parsed-literal::
-
-    Vision model successfully converted and saved to blip_vision_model.xml
-
-
-Text Encoder
-~~~~~~~~~~~~
-
-
-
-The text encoder is used by visual question answering tasks to build a
-question embedding representation. It takes ``input_ids`` with a
-tokenized question, the output image embeddings obtained from the vision
-model, and attention masks for them.
-
-.. code:: ipython3
-
-    TEXT_ENCODER_OV = Path("blip_text_encoder.xml")
-
-
-    text_encoder = model.text_encoder
-    text_encoder.eval()
-
-    # if openvino model does not exist, convert it to IR
-    if not TEXT_ENCODER_OV.exists():
-        # prepare example inputs
-        image_embeds = vision_outputs[0]
-        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long)
-        input_dict = {
-            "input_ids": inputs["input_ids"],
-            "attention_mask": inputs["attention_mask"],
-            "encoder_hidden_states": image_embeds,
-            "encoder_attention_mask": image_attention_mask,
-        }
-        # export PyTorch model
-        with torch.no_grad():
-            ov_text_encoder = ov.convert_model(text_encoder, example_input=input_dict)
-        # save model on disk for next usages
-        ov.save_model(ov_text_encoder, TEXT_ENCODER_OV)
-        print(f"Text encoder successfully converted and saved to {TEXT_ENCODER_OV}")
-    else:
-        print(f"Text encoder will be loaded from {TEXT_ENCODER_OV}")
-
-
-.. parsed-literal::
-
-    Text encoder successfully converted and saved to blip_text_encoder.xml
-
-
-Text Decoder
-~~~~~~~~~~~~
-
-
-
-The text decoder is responsible for generating the sequence of tokens
-representing the model output (an answer to a question or a caption), using
-an image (and a question, if required) representation. The generation
-approach is based on the assumption that the probability distribution of a
-word sequence can be decomposed into the product of conditional next-word
-distributions. In other words, the model predicts the next token in a loop,
-guided by the previously generated tokens, until a stop condition is reached
-(the generated sequence reaches the maximum length or the end-of-string
-token is obtained). The way the next token is selected from the predicted
-probabilities is driven by the chosen decoding methodology. You can find
-more information about the most popular decoding methods in this
-`blog `__. The entry point
-for the generation process for models from the Hugging Face Transformers
-library is the ``generate`` method.
-You can find more information about its parameters and configuration in the
-`documentation `__.
-To preserve flexibility in the choice of decoding methodology, you will
-convert only a single inference step of the model.
-
-To optimize the generation process and use memory more efficiently, the
-``use_cache=True`` option is enabled. Since the output side is
-auto-regressive, an output token hidden state remains the same once it has
-been computed for every further generation step. Therefore, recomputing it
-every time you want to generate a new token seems wasteful. With the
-cache, the model saves the hidden state once it has been computed. The
-model only computes the one for the most recently generated output token
-at each time step, re-using the saved ones for the previously computed
-tokens. This reduces the generation complexity from O(n^3) to O(n^2) for
-a transformer model. More details about how it works can be found in this
-`article `__.
-With this option, the model gets the previous step’s hidden states as
-input and additionally provides hidden states for the current step as
-output. Initially, there are no hidden states from a previous step, so the
-first step does not require you to provide them, but they should be
-initialized with default values. In PyTorch, past hidden state outputs are
-represented as a list of pairs (hidden state for key, hidden state for
-value) for each transformer layer in the model. The OpenVINO model does not
-support nested outputs, so they will be flattened.
-
-Similar to ``text_encoder``, ``text_decoder`` can work with input
-sequences of different lengths and requires preserving dynamic input
-shapes.
-
-.. code:: ipython3
-
-    text_decoder = model.text_decoder
-    text_decoder.eval()
-
-    TEXT_DECODER_OV = Path("blip_text_decoder_with_past.xml")
-
-    # prepare example inputs
-    input_ids = torch.tensor([[30522]])  # begin of sequence token id
-    attention_mask = torch.tensor([[1]])  # attention mask for input_ids
-    encoder_hidden_states = torch.rand((1, 10, 768))  # encoder last hidden state from text_encoder
-    encoder_attention_mask = torch.ones((1, 10), dtype=torch.long)  # attention mask for encoder hidden states
-
-    input_dict = {
-        "input_ids": input_ids,
-        "attention_mask": attention_mask,
-        "encoder_hidden_states": encoder_hidden_states,
-        "encoder_attention_mask": encoder_attention_mask,
-    }
-    text_decoder_outs = text_decoder(**input_dict)
-    # extend input dictionary with hidden states from previous step
-    input_dict["past_key_values"] = text_decoder_outs["past_key_values"]
-
-    text_decoder.config.torchscript = True
-    if not TEXT_DECODER_OV.exists():
-        # export PyTorch model
-        with torch.no_grad():
-            ov_text_decoder = ov.convert_model(text_decoder, example_input=input_dict)
-        # save model on disk for next usages
-        ov.save_model(ov_text_decoder, TEXT_DECODER_OV)
-        print(f"Text decoder successfuly converted and saved to {TEXT_DECODER_OV}")
-    else:
-        print(f"Text decoder will be loaded from {TEXT_DECODER_OV}")
-
-
-.. parsed-literal::
-
-    /home/ltalamanova/tmp_venv/lib/python3.11/site-packages/transformers/models/blip/modeling_blip_text.py:635: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
- if causal_mask.shape[1] < attention_mask.shape[1]: - /home/ltalamanova/tmp_venv/lib/python3.11/site-packages/torch/jit/_trace.py:165: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at aten/src/ATen/core/TensorBody.h:489.) - if a.grad is not None: - - -.. parsed-literal:: - - Text decoder successfuly converted and saved to blip_text_decoder_with_past.xml - - -Run OpenVINO Model ------------------- - - - -Prepare Inference Pipeline -~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -As discussed before, the model consists of several blocks which can be -reused for building pipelines for different tasks. In the diagram below, -you can see how image captioning works: - -|image01| - -The visual model accepts the image preprocessed by ``BlipProcessor`` as -input and produces image embeddings, which are directly passed to the -text decoder for generation caption tokens. When generation is finished, -output sequence of tokens is provided to ``BlipProcessor`` for decoding -to text using a tokenizer. - -The pipeline for question answering looks similar, but with additional -question processing. In this case, image embeddings and question -tokenized by ``BlipProcessor`` are provided to the text encoder and then -multimodal question embedding is passed to the text decoder for -performing generation of answers. - -|image11| - -The next step is implementing both pipelines using OpenVINO models. - -.. |image01| image:: https://user-images.githubusercontent.com/29454499/221865836-a56da06e-196d-449c-a5dc-4136da6ab5d5.png -.. |image11| image:: https://user-images.githubusercontent.com/29454499/221868167-d0081add-d9f3-4591-80e7-4753c88c1d0a.png - -.. code:: ipython3 - - # create OpenVINO Core object instance - core = ov.Core() - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - - device - - -.. parsed-literal:: - - huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... - To disable this warning, you can either: - - Avoid using `tokenizers` before the fork if possible - - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=4, options=('CPU', 'GPU.0', 'GPU.1', 'GPU.2', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - # load models on device - ov_vision_model = core.compile_model(VISION_MODEL_OV, device.value) - ov_text_encoder = core.compile_model(TEXT_ENCODER_OV, device.value) - ov_text_decoder_with_past = core.compile_model(TEXT_DECODER_OV, device.value) - -.. code:: ipython3 - - from functools import partial - from blip_model import text_decoder_forward - - text_decoder.forward = partial(text_decoder_forward, ov_text_decoder_with_past=ov_text_decoder_with_past) - -The model helper class has two methods for generation: -**generate_answer** - used for visual question answering, -**generate_caption** - used for caption generation. 
For initialization, -model class accepts compiled OpenVINO models for the text encoder, -vision model and text decoder, and also configuration for generation and -initial token for decoder work. - -.. code:: ipython3 - - if not Path("./blip_model.py").exists(): - download_file(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/blip-visual-language-processing/blip_model.py") - from blip_model import OVBlipModel - - ov_model = OVBlipModel(model.config, model.decoder_start_token_id, ov_vision_model, ov_text_encoder, text_decoder) - out = ov_model.generate_answer(**inputs, max_length=20) - -Now, the model is ready for generation. - -Image Captioning -~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - out = ov_model.generate_caption(inputs["pixel_values"], max_length=20) - caption = processor.decode(out[0], skip_special_tokens=True) - fig = visualize_results(raw_image, caption) - - - -.. image:: blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_25_0.png - - -Question Answering -~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - start = time.perf_counter() - out = ov_model.generate_answer(**inputs, max_length=20) - end = time.perf_counter() - start - answer = processor.decode(out[0], skip_special_tokens=True) - fig = visualize_results(raw_image, answer, question) - - - -.. image:: blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_27_0.png - - -.. code:: ipython3 - - print(f"Processing time: {end:.4f}") - - -.. parsed-literal:: - - Processing time: 0.1186 - - -Optimize model using NNCF -------------------------- - - - -`NNCF `__ enables -post-training quantization by adding the quantization layers into the -model graph and then using a subset of the training dataset to -initialize the parameters of these additional quantization layers. The -framework is designed so that modifications to your original training -code are minor. - -The optimization process contains the following steps: - -1. Create a dataset for quantization. -2. Run ``nncf.quantize`` to get a quantized model from the pre-trained - ``FP16`` model. -3. Serialize the ``INT8`` model using ``openvino.save_model`` function. - -.. - - **NOTE**: Quantization is time and memory consuming operation. - Running quantization code below may take some time. You can disable - it using widget below: - -.. code:: ipython3 - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - VISION_MODEL_OV_INT8 = Path(str(VISION_MODEL_OV).replace(".xml", "_int8.xml")) - TEXT_ENCODER_OV_INT8 = Path(str(TEXT_ENCODER_OV).replace(".xml", "_int8.xml")) - TEXT_DECODER_OV_INT8 = Path(str(TEXT_DECODER_OV).replace(".xml", "_int8.xml")) - int8_model = None - - # Fetch skip_kernel_extension module - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -Prepare dataset -~~~~~~~~~~~~~~~ - - - -The `VQAv2 `__ is a dataset containing -open-ended questions about images. These questions require an -understanding of vision, language and commonsense knowledge to answer. - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - import aiohttp - - import numpy as np - from datasets import load_dataset - from tqdm.notebook import tqdm - - def preprocess_batch(batch, vision_model, inputs_info): - """ - Preprocesses a dataset batch by loading and transforming image and text data. - VQAv2 dataset contains multiple questions to image. - To reduce dataset preparation time we will store preprocessed images in `inputs_info`. - """ - image_id = batch["image_id"] - if image_id in inputs_info: - inputs = processor(text=batch['question'], return_tensors="np") - pixel_values = inputs_info[image_id]["pixel_values"] - encoder_hidden_states = inputs_info[image_id]["encoder_hidden_states"] - else: - inputs = processor(images=batch["image"], text=batch["question"], return_tensors="np") - pixel_values = inputs["pixel_values"] - encoder_hidden_states = vision_model(pixel_values)[vision_model.output(0)] - inputs_info[image_id] = { - "pixel_values": pixel_values, - "encoder_hidden_states": encoder_hidden_states, - "text_encoder_inputs": [] - } - - text_encoder_inputs = { - "input_ids": inputs["input_ids"], - "attention_mask": inputs["attention_mask"] - } - inputs_info[image_id]["text_encoder_inputs"].append(text_encoder_inputs) - - - def prepare_input_data(dataloader, vision_model, opt_init_steps): - """ - Store calibration subset in List to reduce quantization time. - """ - inputs_info = {} - for idx, batch in enumerate(tqdm(dataloader, total=opt_init_steps, desc="Prepare calibration data")): - preprocess_batch(batch, vision_model, inputs_info) - - calibration_subset = [] - for image_id in inputs_info: - pixel_values = inputs_info[image_id]["pixel_values"] - encoder_hidden_states = inputs_info[image_id]["encoder_hidden_states"] - encoder_attention_mask = np.ones(encoder_hidden_states.shape[:-1], dtype=int) - for text_encoder_inputs in inputs_info[image_id]["text_encoder_inputs"]: - text_encoder_inputs["encoder_hidden_states"] = encoder_hidden_states - text_encoder_inputs["encoder_attention_mask"] = encoder_attention_mask - blip_inputs = { - "vision_model_inputs": {"pixel_values": pixel_values}, - "text_encoder_inputs": text_encoder_inputs, - } - calibration_subset.append(blip_inputs) - return calibration_subset - - - def prepare_dataset(vision_model, opt_init_steps=300, streaming=False): - """ - Prepares a vision-text dataset for quantization. - """ - split = f"train[:{opt_init_steps}]" if not streaming else "train" - dataset = load_dataset( - "HuggingFaceM4/VQAv2", - split=split, - streaming=streaming, - trust_remote_code=True, - storage_options={'client_kwargs': {'timeout': aiohttp.ClientTimeout(total=3600)}} - ) - dataset = dataset.shuffle(seed=42) - if streaming: - dataset = dataset.take(opt_init_steps) - calibration_subset = prepare_input_data(dataset, vision_model, opt_init_steps) - return calibration_subset - -Loading and processing the dataset in streaming mode may take a long -time and depends on your internet connection. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - import gc - - if not VISION_MODEL_OV_INT8.exists() or not TEXT_ENCODER_OV_INT8.exists(): - comp_vision_model = core.compile_model(VISION_MODEL_OV, device.value) - calibration_data = prepare_dataset(comp_vision_model) - del comp_vision_model - gc.collect(); - -Quantize vision model -~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - if not VISION_MODEL_OV_INT8.exists(): - vision_dataset = nncf.Dataset(calibration_data, lambda x: x["vision_model_inputs"]) - vision_model = core.read_model(VISION_MODEL_OV) - - quantized_model = nncf.quantize( - model=vision_model, - calibration_dataset=vision_dataset, - model_type=nncf.ModelType.TRANSFORMER - ) - - ov.save_model(quantized_model, VISION_MODEL_OV_INT8) - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - INFO:nncf:36 ignored nodes were found by name in the NNCFGraph - INFO:nncf:48 ignored nodes were found by name in the NNCFGraph - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -Quantize text encoder -~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - if not TEXT_ENCODER_OV_INT8.exists(): - text_encoder_dataset = nncf.Dataset(calibration_data, lambda x: x["text_encoder_inputs"]) - text_encoder_model = core.read_model(TEXT_ENCODER_OV) - - quantized_model = nncf.quantize( - model=text_encoder_model, - calibration_dataset=text_encoder_dataset, - model_type=nncf.ModelType.TRANSFORMER - ) - ov.save_model(quantized_model, TEXT_ENCODER_OV_INT8) - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - INFO:nncf:72 ignored nodes were found by name in the NNCFGraph - INFO:nncf:73 ignored nodes were found by name in the NNCFGraph - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -Compress weights of text decoder -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The quantization of the text decoder leads to significant accuracy loss. -Instead of post-training quantization, we can use data free weights -compression to reduce the model footprint. - -The optimization process contains the following steps: - -1. Run ``nncf.compress_weights`` to get a model with compressed weights. -2. Serialize the ``OpenVINO`` model using ``openvino.save_model`` - function. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - if not TEXT_DECODER_OV_INT8.exists(): - text_decoder_model = core.read_model(TEXT_DECODER_OV) - compressed_text_decoder = nncf.compress_weights(text_decoder_model) - ov.save_model(compressed_text_decoder, TEXT_DECODER_OV_INT8) - - -.. parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - +--------------+---------------------------+-----------------------------------+ - | Num bits (N) | % all parameters (layers) | % ratio-defining parameters | - | | | (layers) | - +==============+===========================+===================================+ - | 8 | 100% (124 / 124) | 100% (124 / 124) | - +--------------+---------------------------+-----------------------------------+ - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -Run optimized OpenVINO model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The steps for making predictions with the optimized OpenVINO BLIP model -are similar to the PyTorch model. Let us check the model result using -the same input data like for model before quantization - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - q_ov_vision_model = core.compile_model(VISION_MODEL_OV_INT8, device.value) - q_ov_text_encoder = core.compile_model(TEXT_ENCODER_OV_INT8, device.value) - q_ov_text_decoder_with_past = core.compile_model(TEXT_DECODER_OV_INT8, device.value) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from functools import partial - from transformers import BlipForQuestionAnswering - from blip_model import OVBlipModel, text_decoder_forward - - model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") - text_decoder = model.text_decoder - text_decoder.eval() - - text_decoder.forward = partial(text_decoder_forward, ov_text_decoder_with_past=q_ov_text_decoder_with_past) - int8_model = OVBlipModel(model.config, model.decoder_start_token_id, q_ov_vision_model, q_ov_text_encoder, text_decoder) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - raw_image = Image.open("demo.jpg").convert('RGB') - question = "how many dogs are in the picture?" - # preprocess input data - inputs = processor(raw_image, question, return_tensors="pt") - -Image captioning -^^^^^^^^^^^^^^^^ - -.. code:: ipython3 - - %%skip not $to_quantize.value - - out = int8_model.generate_caption(inputs["pixel_values"], max_length=20) - caption = processor.decode(out[0], skip_special_tokens=True) - fig = visualize_results(raw_image, caption) - - - -.. image:: blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_47_0.png - - -Question answering -^^^^^^^^^^^^^^^^^^ - -.. code:: ipython3 - - %%skip not $to_quantize.value - - out = int8_model.generate_answer(**inputs, max_length=20) - answer = processor.decode(out[0], skip_special_tokens=True) - fig = visualize_results(raw_image, answer, question) - - - -.. image:: blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_49_0.png - - -Compare file sizes -~~~~~~~~~~~~~~~~~~ - -.. code:: ipython3 - - %%skip not $to_quantize.value - - def calculate_compression_rate(ov_model_path): - fp16_ir_model_size = Path(ov_model_path).with_suffix(".bin").stat().st_size / 1024 - int8_model_path = str(ov_model_path).replace(".xml", "_int8.xml") - quantized_model_size = Path(int8_model_path).with_suffix(".bin").stat().st_size / 1024 - print(f'{ov_model_path.as_posix().split(".")[0]}') - print(f" * FP16 IR model size: {fp16_ir_model_size:.2f} KB") - print(f" * INT8 model size: {quantized_model_size:.2f} KB") - print(f" * Model compression rate: {fp16_ir_model_size / quantized_model_size:.3f}") - -.. code:: ipython3 - - %%skip not $to_quantize.value - - for fp16_path in [VISION_MODEL_OV, TEXT_ENCODER_OV, TEXT_DECODER_OV]: - calculate_compression_rate(fp16_path) - - -.. parsed-literal:: - - blip_vision_model - * FP16 IR model size: 168145.70 KB - * INT8 model size: 84915.48 KB - * Model compression rate: 1.980 - blip_text_encoder - * FP16 IR model size: 268087.16 KB - * INT8 model size: 134676.75 KB - * Model compression rate: 1.991 - blip_text_decoder_with_past - * FP16 IR model size: 269303.35 KB - * INT8 model size: 135302.53 KB - * Model compression rate: 1.990 - - -Compare inference time of the FP16 and optimized models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To measure the inference performance of the ``FP16`` and ``INT8`` -models, we use median inference time on 100 samples of the calibration -dataset. So we can approximately estimate the speed up of the dynamic -quantized models. 
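-
-In other words, the speed-up reported below is just the ratio of the two
-median latencies. A minimal sketch (the variable names here are
-illustrative; the actual measurement helper is defined in the following
-cell):
-
-.. code:: ipython3
-
-    # Illustrative only: both medians are taken over the same 100 calibration samples.
-    speed_up = np.median(fp16_latencies) / np.median(int8_latencies)
-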
- - **NOTE**: For the most accurate performance estimation, it is - recommended to run ``benchmark_app`` in a terminal/command prompt - after closing other applications with static shapes. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import time - import torch - - def calculate_inference_time(blip_model, calibration_data, generate_caption): - inference_time = [] - for inputs in calibration_data: - pixel_values = torch.from_numpy(inputs["vision_model_inputs"]["pixel_values"]) - input_ids = torch.from_numpy(inputs["text_encoder_inputs"]["input_ids"]) - attention_mask = torch.from_numpy(inputs["text_encoder_inputs"]["attention_mask"]) - - start = time.perf_counter() - if generate_caption: - _ = blip_model.generate_caption(pixel_values, max_length=20) - else: - _ = blip_model.generate_answer(pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask, max_length=20) - end = time.perf_counter() - delta = end - start - inference_time.append(delta) - return np.median(inference_time) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - fp_original_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") - fp_text_decoder = fp_original_model.text_decoder - fp_text_decoder.eval() - - comp_text_encoder = core.compile_model(TEXT_ENCODER_OV, device.value) - comp_text_decoder_with_past = core.compile_model(TEXT_DECODER_OV, device.value) - comp_vision_model = core.compile_model(VISION_MODEL_OV, device.value) - fp_text_decoder.forward = partial(text_decoder_forward, ov_text_decoder_with_past=comp_text_decoder_with_past) - fp16_model = OVBlipModel(model.config, model.decoder_start_token_id, comp_vision_model, comp_text_encoder, fp_text_decoder) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - validation_data = calibration_data[:100] - - int8_caption_latency = calculate_inference_time(int8_model, validation_data, generate_caption=True) - fp16_caption_latency = calculate_inference_time(fp16_model, validation_data, generate_caption=True) - - print(f"Image Captioning speed up: {fp16_caption_latency / int8_caption_latency:.3f}") - - -.. parsed-literal:: - - Image Captioning speed up: 1.254 - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - int8_generate_answer_latency = calculate_inference_time(int8_model, validation_data, generate_caption=False) - fp16_generate_answer_latency = calculate_inference_time(fp16_model, validation_data, generate_caption=False) - print(f"Question Answering speed up: {fp16_generate_answer_latency / int8_generate_answer_latency:.3f}") - - -.. parsed-literal:: - - Question Answering speed up: 1.715 - - -Interactive demo ----------------- - - - -Please select below whether you would like to use the quantized model to -launch the interactive demo. - -.. code:: ipython3 - - import ipywidgets as widgets - - use_quantized_model = widgets.Checkbox( - description="Use quantized model", - value=int8_model is not None, - disabled=int8_model is None, - ) - - use_quantized_model - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Use quantized model') - - - -.. 
code:: ipython3 - - import gradio as gr - - ov_model = int8_model if use_quantized_model.value else ov_model - - - def generate_answer(img, question): - if img is None: - raise gr.Error("Please upload an image or choose one from the examples list") - start = time.perf_counter() - inputs = processor(img, question, return_tensors="pt") - output = ov_model.generate_answer(**inputs, max_length=20) if len(question) else ov_model.generate_caption(inputs["pixel_values"], max_length=20) - answer = processor.decode(output[0], skip_special_tokens=True) - elapsed = time.perf_counter() - start - html = f"

Processing time: {elapsed:.4f}

" - return answer, html - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/blip-visual-language-processing/gradio_helper.py" - ) - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=generate_answer) - - try: - demo.launch(debug=False) - except Exception: - demo.launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_25_0.png b/docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_25_0.png deleted file mode 100644 index c90aa04a52d5e9..00000000000000 --- a/docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_25_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec8d01e06da8fec020483260e6bbe7ac04612ca64dc4642197e8966d20af384f -size 206675 diff --git a/docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_27_0.png b/docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_27_0.png deleted file mode 100644 index 7de75c31a7e3e4..00000000000000 --- a/docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_27_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4d6069f2803d8a4ea3d5855ed874b353af223ba5418714b3c5e637114ff206d5 -size 210267 diff --git a/docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_47_0.png b/docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_47_0.png deleted file mode 100644 index 1749467ee0739b..00000000000000 --- a/docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_47_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c4efd1fbdfcf5ae4b424316254235eb40a09d5716b6cff4aadb5b4f4b9eefe34 -size 205951 diff --git a/docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_49_0.png b/docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_49_0.png deleted file mode 100644 index 7de75c31a7e3e4..00000000000000 --- a/docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_49_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4d6069f2803d8a4ea3d5855ed874b353af223ba5418714b3c5e637114ff206d5 -size 210267 diff --git a/docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_7_0.png b/docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_7_0.png deleted file mode 100644 index 7de75c31a7e3e4..00000000000000 --- a/docs/notebooks/blip-visual-language-processing-with-output_files/blip-visual-language-processing-with-output_7_0.png +++ /dev/null @@ -1,3 +0,0 @@ 
-version https://git-lfs.github.com/spec/v1 -oid sha256:4d6069f2803d8a4ea3d5855ed874b353af223ba5418714b3c5e637114ff206d5 -size 210267 diff --git a/docs/notebooks/catvton-with-output.rst b/docs/notebooks/catvton-with-output.rst deleted file mode 100644 index f6836f948e2a53..00000000000000 --- a/docs/notebooks/catvton-with-output.rst +++ /dev/null @@ -1,387 +0,0 @@ -Virtual Try-On with CatVTON and OpenVINO -======================================== - -Virtual try-on methods based on diffusion models achieve realistic -try-on effects but replicate the backbone network as a ReferenceNet or -leverage additional image encoders to process condition inputs, -resulting in high training and inference costs. `In this -work `__, authors rethink the necessity -of ReferenceNet and image encoders and innovate the interaction between -garment and person, proposing CatVTON, a simple and efficient virtual -try-on diffusion model. It facilitates the seamless transfer of in-shop -or worn garments of arbitrary categories to target persons by simply -concatenating them in spatial dimensions as inputs. The efficiency of -the model is demonstrated in three aspects: 1. Lightweight network. Only -the original diffusion modules are used, without additional network -modules. The text encoder and cross attentions for text injection in the -backbone are removed, further reducing the parameters by 167.02M. 2. -Parameter-efficient training. We identified the try-on relevant modules -through experiments and achieved high-quality try-on effects by training -only 49.57M parameters (∼5.51% of the backbone network’s parameters). 3. -Simplified inference. CatVTON eliminates all unnecessary conditions and -preprocessing steps, including pose estimation, human parsing, and text -input, requiring only garment reference, target person image, and mask -for the virtual try-on process. Extensive experiments demonstrate that -CatVTON achieves superior qualitative and quantitative results with -fewer prerequisites and trainable parameters than baseline methods. -Furthermore, CatVTON shows good generalization in in-the-wild scenarios -despite using open-source datasets with only 73K samples. - -Teaser image from `CatVTON -GitHub `__ |teaser| - -In this tutorial we consider how to convert and run this model using -OpenVINO. An additional part demonstrates how to run optimization with -`NNCF `__ to speed up -pipeline. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Convert the model to OpenVINO - IR <#convert-the-model-to-openvino-ir>`__ -- `Compiling models <#compiling-models>`__ -- `Optimize model using NNCF Post-Training Quantization - API <#optimize-model-using-nncf-post-training-quantization-api>`__ - - - `Run Post-Training - Quantization <#run-post-training-quantization>`__ - - `Run Weights Compression <#run-weights-compression>`__ - - `Compare model file sizes <#compare-model-file-sizes>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. |teaser| image:: https://github.com/Zheng-Chong/CatVTON/blob/edited/resource/img/teaser.jpg?raw=true - -Prerequisites -------------- - - - -.. 
code:: ipython3 - - import platform - - %pip install -q "openvino>=2024.4" "nncf>=2.13.0" - %pip install -q "torch>=2.1" "diffusers>=0.29.1" torchvision opencv_python --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q fvcore "pillow" "tqdm" "gradio>=4.36" "omegaconf==2.4.0.dev3" av pycocotools cloudpickle scipy accelerate "transformers>=4.27.3" - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("catvton.ipynb") - -.. code:: ipython3 - - from cmd_helper import clone_repo - - - clone_repo("https://github.com/Zheng-Chong/CatVTON.git", "3b795364a4d2f3b5adb365f39cdea376d20bc53c") - -Convert the model to OpenVINO IR -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate -Representation (IR). `OpenVINO model conversion -API `__ -should be used for these purposes. ``ov.convert_model`` function accepts -original PyTorch model instance and example input for tracing and -returns ``ov.Model`` representing this model in OpenVINO framework. -Converted model can be used for saving on disk using ``ov.save_model`` -function or directly loading on device using ``core.complie_model``. - -``ov_catvton_helper.py`` script contains helper function for models -downloading and models conversion, please check its content if you -interested in conversion details. - -To download checkpoints and load models, just call the helper function -``download_models``. It takes care about it. Functions -``convert_pipeline_models`` and ``convert_automasker_models`` will -convert models from pipeline and ``automasker`` in OpenVINO format. - -The original pipeline contains VAE encoder and decoder and UNET. -|CatVTON-overview| - -The ``automasker`` contains ``DensePose`` with -``detectron2.GeneralizedRCNN`` model and ``SCHP`` (``LIP`` and ``ATR`` -version). - -.. |CatVTON-overview| image:: https://github.com/user-attachments/assets/e35c8dab-1c54-47b1-a73b-2a62e6cdca7c - -.. code:: ipython3 - - from ov_catvton_helper import download_models, convert_pipeline_models, convert_automasker_models - - pipeline, mask_processor, automasker = download_models() - vae_scaling_factor = pipeline.vae.config.scaling_factor - convert_pipeline_models(pipeline) - convert_automasker_models(automasker) - -Compiling models ----------------- - - - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - import openvino as ov - - from notebook_utils import device_widget - - - core = ov.Core() - - device = device_widget() - - device - -``get_compiled_pipeline`` and ``get_compiled_automasker`` functions -defined in ``ov_catvton_helper.py`` provides convenient way for getting -the pipeline and the ``automasker`` with compiled ov-models that are -compatible with the original interface. 
It accepts the original pipeline -and ``automasker``, inference device and directories with converted -models as arguments. Under the hood we create callable wrapper classes -for compiled models to allow interaction with original pipelines. Note -that all of wrapper classes return ``torch.Tensor``\ s instead of -``np.array``\ s. And then insert wrappers instances in the pipeline. - -.. code:: ipython3 - - from ov_catvton_helper import ( - get_compiled_pipeline, - get_compiled_automasker, - VAE_ENCODER_PATH, - VAE_DECODER_PATH, - UNET_PATH, - DENSEPOSE_PROCESSOR_PATH, - SCHP_PROCESSOR_ATR, - SCHP_PROCESSOR_LIP, - ) - - pipeline = get_compiled_pipeline(pipeline, core, device, VAE_ENCODER_PATH, VAE_DECODER_PATH, UNET_PATH, vae_scaling_factor) - automasker = get_compiled_automasker(automasker, core, device, DENSEPOSE_PROCESSOR_PATH, SCHP_PROCESSOR_ATR, SCHP_PROCESSOR_LIP) - -Optimize model using NNCF Post-Training Quantization API --------------------------------------------------------- - - - -`NNCF `__ provides a suite of -advanced algorithms for Neural Networks inference optimization in -OpenVINO with minimal accuracy drop. We will use 8-bit quantization in -post-training mode (without the fine-tuning pipeline) for the UNet -model, and 4-bit weight compression for the remaining models. - - **NOTE**: Quantization is time and memory consuming operation. - Running quantization code below may take some time. You can disable - it using widget below: - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget() - - to_quantize - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. code:: ipython3 - - is_optimized_pipe_available = False - - # Fetch skip_kernel_extension module - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -Run Post-Training Quantization -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The optimization process contains the following steps: - -1. Create a Dataset for quantization. -2. Run ``nncf.quantize`` for getting an optimized model. -3. Serialize an OpenVINO IR model, using the ``openvino.save_model`` - function. - -We use a couple of images from the original repository as calibration -data. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from pathlib import Path - from catvton_quantization_helper import collect_calibration_data, UNET_INT8_PATH - - dataset = [ - ( - Path("CatVTON/resource/demo/example/person/men/model_5.png"), - Path("CatVTON/resource/demo/example/condition/upper/24083449_54173465_2048.jpg"), - ), - ( - Path("CatVTON/resource/demo/example/person/women/2-model_4.png"), - Path("CatVTON/resource/demo/example/condition/overall/21744571_51588794_1000.jpg"), - ), - ] - - if not UNET_INT8_PATH.exists(): - subset_size = 100 - calibration_data = collect_calibration_data(pipeline, automasker, mask_processor, dataset, subset_size) - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - import gc - import nncf - from ov_catvton_helper import UNET_PATH - - # cleanup before quantization to free memory - del pipeline - del automasker - gc.collect() - - - if not UNET_INT8_PATH.exists(): - unet = core.read_model(UNET_PATH) - quantized_model = nncf.quantize( - model=unet, - calibration_dataset=nncf.Dataset(calibration_data), - subset_size=subset_size, - model_type=nncf.ModelType.TRANSFORMER, - ) - ov.save_model(quantized_model, UNET_INT8_PATH) - del quantized_model - gc.collect() - -Run Weights Compression -~~~~~~~~~~~~~~~~~~~~~~~ - - - -Quantizing of the remaining components of the pipeline does not -significantly improve inference performance but can lead to a -substantial degradation of accuracy. The weight compression will be -applied to footprint reduction. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from catvton_quantization_helper import compress_models - - compress_models(core) - - is_optimized_pipe_available = True - -Compare model file sizes -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - from catvton_quantization_helper import compare_models_size - - compare_models_size() - - -.. parsed-literal:: - - vae_encoder compression rate: 2.011 - vae_decoder compression rate: 2.007 - unet compression rate: 1.995 - densepose_processor compression rate: 2.019 - schp_processor_atr compression rate: 1.993 - schp_processor_lip compression rate: 1.993 - - -Interactive inference ---------------------- - - - -Please select below whether you would like to use the quantized models -to launch the interactive demo. - -.. code:: ipython3 - - from ov_catvton_helper import get_pipeline_selection_option - - use_quantized_models = get_pipeline_selection_option(is_optimized_pipe_available) - - use_quantized_models - -.. code:: ipython3 - - from gradio_helper import make_demo - - from catvton_quantization_helper import ( - VAE_ENCODER_INT4_PATH, - VAE_DECODER_INT4_PATH, - DENSEPOSE_PROCESSOR_INT4_PATH, - SCHP_PROCESSOR_ATR_INT4, - SCHP_PROCESSOR_LIP_INT4, - UNET_INT8_PATH, - ) - - pipeline, mask_processor, automasker = download_models() - if use_quantized_models.value: - pipeline = get_compiled_pipeline(pipeline, core, device, VAE_ENCODER_INT4_PATH, VAE_DECODER_INT4_PATH, UNET_INT8_PATH, vae_scaling_factor) - automasker = get_compiled_automasker(automasker, core, device, DENSEPOSE_PROCESSOR_INT4_PATH, SCHP_PROCESSOR_ATR_INT4, SCHP_PROCESSOR_LIP_INT4) - else: - pipeline = get_compiled_pipeline(pipeline, core, device, VAE_ENCODER_PATH, VAE_DECODER_PATH, UNET_PATH, vae_scaling_factor) - automasker = get_compiled_automasker(automasker, core, device, DENSEPOSE_PROCESSOR_PATH, SCHP_PROCESSOR_ATR, SCHP_PROCESSOR_LIP) - - output_dir = "output" - demo = make_demo(pipeline, mask_processor, automasker, output_dir) - try: - demo.launch(debug=True) - except Exception: - demo.launch(debug=True, share=True) diff --git a/docs/notebooks/clip-language-saliency-map-with-output.rst b/docs/notebooks/clip-language-saliency-map-with-output.rst deleted file mode 100644 index d7ed24684187db..00000000000000 --- a/docs/notebooks/clip-language-saliency-map-with-output.rst +++ /dev/null @@ -1,928 +0,0 @@ -Language-Visual Saliency with CLIP and OpenVINO™ -================================================ - -The notebook will cover the following topics: - -- Explanation of a *saliency map* and how it can be used. -- Overview of the CLIP neural network and its usage in generating - saliency maps. 
-- How to split a neural network into parts for separate inference. -- How to speed up inference with OpenVINO™ and asynchronous execution. - -Saliency Map ------------- - -A saliency map is a visualization technique that highlights regions of -interest in an image. For example, it can be used to `explain image -classification -predictions `__ -for a particular label. Here is an example of a saliency map that you -will get in this notebook: - -|image0| - -CLIP ----- - -What Is CLIP? -~~~~~~~~~~~~~ - -CLIP (Contrastive Language–Image Pre-training) is a neural network that -can work with both images and texts. It has been trained to predict -which randomly sampled text snippets are close to a given image, meaning -that a text better describes the image. Here is a visualization of the -pre-training process: - -|image1| `image_source `__ - -To solve the task, CLIP uses two parts: ``Image Encoder`` and -``Text Encoder``. Both parts are used to produce embeddings, which are -vectors of floating-point numbers, for images and texts, respectively. -Given two vectors, one can define and measure the similarity between -them. A popular method to do so is the ``cosine_similarity``, which is -defined as the dot product of the two vectors divided by the product of -their norms: - -.. figure:: https://user-images.githubusercontent.com/29454499/218972165-f61a82f2-9711-4ce6-84b5-58fdd1d80d10.png - :alt: cs - - cs - -The result can range from :math:`-1` to :math:`1`. A value :math:`1` -means that the vectors are similar, :math:`0` means that the vectors are -not “connected” at all, and :math:`-1` is for vectors with somehow -opposite “meaning”. To train CLIP, OpenAI uses samples of texts and -images and organizes them so that the first text corresponds to the -first image in the batch, the second text to the second image, and so -on. Then, cosine similarities are measured between all texts and all -images, and the results are put in a matrix. If the matrix has numbers -close to :math:`1` on a diagonal and close to :math:`0` elsewhere, it -indicates that the network is appropriately trained. - -How to Build a Saliency Map with CLIP? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Providing an image and a text to CLIP returns two vectors. The cosine -similarity between these vectors is calculated, resulting in a number -between :math:`-1` and :math:`1` that indicates whether the text -describes the image or not. The idea is that *some regions of the image -are closer to the text query* than others, and this difference can be -used to build the saliency map. Here is how it can be done: - -1. Compute ``query`` and ``image`` similarity. This will represent the - *neutral value* :math:`s_0` on the ``saliency map``. -2. Get a random ``crop`` of the image. -3. Compute ``crop`` and ``query`` similarity. -4. Subtract the :math:`s_0` from it. If the value is positive, the - ``crop`` is closer to the ``query``, and it should be a red region on - the saliency map. If negative, it should be blue. -5. Update the corresponding region on the ``saliency map``. -6. Repeat steps 2-5 multiple times (``n_iters``). 
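-
-Expressed as code, steps 1-6 reduce to a short loop. The sketch below is
-illustrative only: ``get_random_crop_params``, ``get_cropped_image``,
-``update_saliency_map`` and ``cosine_similarity`` are the helper functions
-defined later in this notebook, while ``embed_text`` and ``embed_image``
-are hypothetical stand-ins for whichever backend (PyTorch or OpenVINO)
-produces the embeddings:
-
-.. code:: ipython3
-
-    # Illustrative sketch of the saliency-map procedure described above.
-    # embed_text / embed_image are placeholders for the text and image encoders.
-    saliency_map = np.zeros((image_height, image_width))
-    s_0 = cosine_similarity(embed_text(query), embed_image(image))      # step 1: neutral value
-    for _ in range(n_iters):                                            # step 6: repeat
-        x, y, crop_size = get_random_crop_params(image_height, image_width, min_crop_size)  # step 2
-        crop = get_cropped_image(image, x, y, crop_size)
-        similarity = cosine_similarity(embed_text(query), embed_image(crop)) - s_0          # steps 3-4
-        update_saliency_map(saliency_map, similarity, x, y, crop_size)  # step 5
-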
- - -**Table of contents:** - - -- `Initial Implementation with Transformers and - Pytorch <#initial-implementation-with-transformers-and-pytorch>`__ -- `Separate Text and Visual - Processing <#separate-text-and-visual-processing>`__ -- `Convert to OpenVINO™ Intermediate Representation (IR) - Format <#convert-to-openvino-intermediate-representation-ir-format>`__ -- `Inference with OpenVINO™ <#inference-with-openvino>`__ - - - `Select inference device <#select-inference-device>`__ - -- `Accelerate Inference with - AsyncInferQueue <#accelerate-inference-with-asyncinferqueue>`__ -- `Pack the Pipeline into a - Function <#pack-the-pipeline-into-a-function>`__ -- `Interactive demo with Gradio <#interactive-demo-with-gradio>`__ -- `What To Do Next <#what-to-do-next>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. |image0| image:: https://user-images.githubusercontent.com/29454499/218967961-9858efd5-fff2-4eb0-bde9-60852f4b31cb.JPG -.. |image1| image:: https://openaiassets.blob.core.windows.net/$web/clip/draft/20210104b/overview-a.svg - -Initial Implementation with Transformers and Pytorch ----------------------------------------------------- - - - -.. code:: ipython3 - - # Install requirements - %pip install -q "openvino>=2023.1.0" - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu transformers "numpy<2" "torch>=2.1" "gradio>=4.19" "matplotlib>=3.4" - -.. code:: ipython3 - - from pathlib import Path - from typing import Tuple, Union, Optional - import requests - - from matplotlib import colors - import matplotlib.pyplot as plt - import numpy as np - import torch - import tqdm - from PIL import Image - from transformers import CLIPModel, CLIPProcessor - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("clip-language-saliency-map.ipynb") - -To get the CLIP model, you will use the ``transformers`` library and the -official ``openai/clip-vit-base-patch16`` from OpenAI. You can use any -CLIP model from the HuggingFace Hub by simply replacing a model -checkpoint in the cell below. - -There are several preprocessing steps required to get text and image -data to the model. Images have to be resized, cropped, and normalized, -and text must be split into tokens and swapped by token IDs. To do that, -you will use ``CLIPProcessor``, which encapsulates all the preprocessing -steps. - -.. code:: ipython3 - - model_checkpoint = "openai/clip-vit-base-patch16" - - model = CLIPModel.from_pretrained(model_checkpoint).eval() - processor = CLIPProcessor.from_pretrained(model_checkpoint) - -Let us write helper functions first. You will generate crop coordinates -and size with ``get_random_crop_params``, and get the actual crop with -``get_crop_image``. To update the saliency map with the calculated -similarity, you will use ``update_saliency_map``. A -``cosine_similarity`` function is just a code representation of the -formula above. - -.. 
code:: ipython3 - - def get_random_crop_params(image_height: int, image_width: int, min_crop_size: int) -> Tuple[int, int, int, int]: - crop_size = np.random.randint(min_crop_size, min(image_height, image_width)) - x = np.random.randint(image_width - crop_size + 1) - y = np.random.randint(image_height - crop_size + 1) - return x, y, crop_size - - - def get_cropped_image(im_tensor: np.array, x: int, y: int, crop_size: int) -> np.array: - return im_tensor[y : y + crop_size, x : x + crop_size, ...] - - - def update_saliency_map(saliency_map: np.array, similarity: float, x: int, y: int, crop_size: int) -> None: - saliency_map[ - y : y + crop_size, - x : x + crop_size, - ] += similarity - - - def cosine_similarity(one: Union[np.ndarray, torch.Tensor], other: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]: - return one @ other.T / (np.linalg.norm(one) * np.linalg.norm(other)) - -Parameters to be defined: - -- ``n_iters`` - number of times the procedure will be repeated. Larger - is better, but will require more time to inference -- ``min_crop_size`` - minimum size of the crop window. A smaller size - will increase the resolution of the saliency map but may require more - iterations -- ``query`` - text that will be used to query the image -- ``image`` - the actual image that will be queried. You will download - the image from a link - -The image at the beginning was acquired with ``n_iters=2000`` and -``min_crop_size=50``. You will start with the lower number of inferences -to get the result faster. It is recommended to experiment with the -parameters at the end, when you get an optimized model. - -.. code:: ipython3 - - n_iters = 300 - min_crop_size = 50 - - query = "Who developed the Theory of General Relativity?" - image_path = Path("example.jpg") - - if not image_path.exists(): - r = requests.get("https://github.com/user-attachments/assets/a5bedef2-e915-4286-bcc9-d599083a99a6") - - with image_path.open("wb") as f: - f.write(r.content) - image = Image.open(image_path) - im_tensor = np.array(image) - - x_dim, y_dim = image.size - -Given the ``model`` and ``processor``, the actual inference is simple: -transform the text and image into combined ``inputs`` and pass it to the -model: - -.. code:: ipython3 - - inputs = processor(text=[query], images=[im_tensor], return_tensors="pt") - with torch.no_grad(): - results = model(**inputs) - results.keys() - - - - -.. parsed-literal:: - - odict_keys(['logits_per_image', 'logits_per_text', 'text_embeds', 'image_embeds', 'text_model_output', 'vision_model_output']) - - - -The model produces several outputs, but for your application, you are -interested in ``text_embeds`` and ``image_embeds``, which are the -vectors for text and image, respectively. Now, you can calculate -``initial_similarity`` between the ``query`` and the ``image``. You also -initialize a saliency map. Numbers in the comments correspond to the -items in the “How To Build a Saliency Map With CLIP?” list above. - -.. code:: ipython3 - - initial_similarity = cosine_similarity(results.text_embeds, results.image_embeds).item() # 1. Computing query and image similarity - saliency_map = np.zeros((y_dim, x_dim)) - - for _ in tqdm.notebook.tqdm(range(n_iters)): # 6. Setting number of the procedure iterations - x, y, crop_size = get_random_crop_params(y_dim, x_dim, min_crop_size) - im_crop = get_cropped_image(im_tensor, x, y, crop_size) # 2. 
Getting a random crop of the image - - inputs = processor(text=[query], images=[im_crop], return_tensors="pt") - with torch.no_grad(): - results = model(**inputs) # 3. Computing crop and query similarity - - similarity = ( - cosine_similarity(results.text_embeds, results.image_embeds).item() - initial_similarity - ) # 4. Subtracting query and image similarity from crop and query similarity - update_saliency_map(saliency_map, similarity, x, y, crop_size) # 5. Updating the region on the saliency map - - - -.. parsed-literal:: - - 0%| | 0/300 [00:00 None: - fig = plt.figure(dpi=150) - plt.imshow(image_tensor) - plt.imshow( - saliency_map, - norm=colors.TwoSlopeNorm(vcenter=0), - cmap="jet", - alpha=0.5, # make saliency map trasparent to see original picture - ) - if query: - plt.title(f'Query: "{query}"') - plt.axis("off") - return fig - - - plot_saliency_map(im_tensor, saliency_map, query); - - - -.. image:: clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_17_0.png - - -Separate Text and Visual Processing ------------------------------------ - - - -The code above is functional, but there are some repeated computations -that can be avoided. The text embedding can be computed once because it -does not depend on the input image. This separation will also be useful -in the future. The initial preparation will remain the same since you -still need to compute the similarity between the text and the full -image. After that, the ``get_image_features`` method could be used to -obtain embeddings for the cropped images. - -.. code:: ipython3 - - inputs = processor(text=[query], images=[im_tensor], return_tensors="pt") - with torch.no_grad(): - results = model(**inputs) - text_embeds = results.text_embeds # save text embeddings to use them later - - initial_similarity = cosine_similarity(text_embeds, results.image_embeds).item() - saliency_map = np.zeros((y_dim, x_dim)) - - for _ in tqdm.notebook.tqdm(range(n_iters)): - x, y, crop_size = get_random_crop_params(y_dim, x_dim, min_crop_size) - im_crop = get_cropped_image(im_tensor, x, y, crop_size) - - image_inputs = processor(images=[im_crop], return_tensors="pt") # crop preprocessing - with torch.no_grad(): - image_embeds = model.get_image_features(**image_inputs) # calculate image embeddings only - - similarity = cosine_similarity(text_embeds, image_embeds).item() - initial_similarity - update_saliency_map(saliency_map, similarity, x, y, crop_size) - - plot_saliency_map(im_tensor, saliency_map, query); - - - -.. parsed-literal:: - - 0%| | 0/300 [00:00`__ - -.. 
code:: ipython3 - - import openvino as ov - - model_name = model_checkpoint.split("/")[-1] - - model.config.torchscript = True - model.forward = model.get_text_features - text_ov_model = ov.convert_model( - model, - example_input={ - "input_ids": inputs.input_ids, - "attention_mask": inputs.attention_mask, - }, - ) - - # get image size after preprocessing from the processor - crops_info = processor.image_processor.crop_size.values() if hasattr(processor, "image_processor") else processor.feature_extractor.crop_size.values() - model.forward = model.get_image_features - image_ov_model = ov.convert_model( - model, - example_input={"pixel_values": inputs.pixel_values}, - input=[1, 3, *crops_info], - ) - - ov_dir = Path("ir") - ov_dir.mkdir(exist_ok=True) - text_model_path = ov_dir / f"{model_name}_text.xml" - image_model_path = ov_dir / f"{model_name}_image.xml" - - # write resulting models on disk - ov.save_model(text_ov_model, text_model_path) - ov.save_model(image_ov_model, image_model_path) - - -.. parsed-literal:: - - WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. - - -.. parsed-literal:: - - [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... - To disable this warning, you can either: - - Avoid using `tokenizers` before the fork if possible - - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) - huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... - To disable this warning, you can either: - - Avoid using `tokenizers` before the fork if possible - - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) - huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... - To disable this warning, you can either: - - Avoid using `tokenizers` before the fork if possible - - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) - - -.. parsed-literal:: - - No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda' - /home/ea/work/ov_venv/lib/python3.8/site-packages/transformers/models/clip/modeling_clip.py:287: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - /home/ea/work/ov_venv/lib/python3.8/site-packages/transformers/models/clip/modeling_clip.py:295: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
- if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): - /home/ea/work/ov_venv/lib/python3.8/site-packages/transformers/models/clip/modeling_clip.py:304: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if attention_mask.size() != (bsz, 1, tgt_len, src_len): - /home/ea/work/ov_venv/lib/python3.8/site-packages/transformers/models/clip/modeling_clip.py:327: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): - - -Now, you have two separate models for text and images, stored on disk -and ready to be loaded and inferred with OpenVINO™. - -Inference with OpenVINO™ ------------------------- - - - -1. Create an instance of the ``Core`` object that will handle any - interaction with OpenVINO runtime for you. -2. Use the ``core.read_model`` method to load the model into memory. -3. Compile the model with the ``core.compile_model`` method for a - particular device to apply device-specific optimizations. -4. Use the compiled model for inference. - -.. code:: ipython3 - - core = ov.Core() - - text_model = core.read_model(text_model_path) - image_model = core.read_model(image_model_path) - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - text_model = core.compile_model(model=text_model, device_name=device.value) - image_model = core.compile_model(model=image_model, device_name=device.value) - -OpenVINO supports ``numpy.ndarray`` as an input type, so you change the -``return_tensors`` to ``np``. You also convert a transformers’ -``BatchEncoding`` object to a python dictionary with input names as keys -and input tensors for values. - -Once you have a compiled model, the inference is similar to Pytorch - a -compiled model is callable. Just pass input data to it. Inference -results are stored in the dictionary. Once you have a compiled model, -the inference process is mostly similar. - -.. code:: ipython3 - - text_inputs = dict(processor(text=[query], images=[im_tensor], return_tensors="np")) - image_inputs = text_inputs.pop("pixel_values") - - text_embeds = text_model(text_inputs)[0] - image_embeds = image_model(image_inputs)[0] - - initial_similarity = cosine_similarity(text_embeds, image_embeds) - saliency_map = np.zeros((y_dim, x_dim)) - - for _ in tqdm.notebook.tqdm(range(n_iters)): - x, y, crop_size = get_random_crop_params(y_dim, x_dim, min_crop_size) - im_crop = get_cropped_image(im_tensor, x, y, crop_size) - - image_inputs = processor(images=[im_crop], return_tensors="np").pixel_values - image_embeds = image_model(image_inputs)[image_model.output()] - - similarity = cosine_similarity(text_embeds, image_embeds) - initial_similarity - update_saliency_map(saliency_map, similarity, x, y, crop_size) - - plot_saliency_map(im_tensor, saliency_map, query); - - - -.. 
parsed-literal:: - - 0%| | 0/300 [00:00 None: - pbar = user_data.pop("pbar") - - image_embeds = infer_request.get_output_tensor().data - similarity = cosine_similarity(user_data.pop("text_embeds"), image_embeds) - user_data.pop("initial_similarity") - update_saliency_map(**user_data, similarity=similarity) - - pbar.update(1) # update the progress bar - - - infer_queue = ov.AsyncInferQueue(image_model) - infer_queue.set_callback(completion_callback) - -.. code:: ipython3 - - def infer( - im_tensor, - x_dim, - y_dim, - text_embeds, - image_embeds, - initial_similarity, - saliency_map, - query, - n_iters, - min_crop_size, - _tqdm=tqdm.notebook.tqdm, - include_query=True, - ): - with _tqdm(total=n_iters) as pbar: - for _ in range(n_iters): - x, y, crop_size = get_random_crop_params(y_dim, x_dim, min_crop_size) - im_crop = get_cropped_image(im_tensor, x, y, crop_size) - - image_inputs = processor(images=[im_crop], return_tensors="np") - - # push data to the queue - infer_queue.start_async( - # pass inference data as usual - image_inputs.pixel_values, - # the data that will be passed to the callback after the inference complete - { - "text_embeds": text_embeds, - "saliency_map": saliency_map, - "initial_similarity": initial_similarity, - "x": x, - "y": y, - "crop_size": crop_size, - "pbar": pbar, - }, - ) - - # after you pushed all data to the queue you wait until all callbacks finished - infer_queue.wait_all() - - return plot_saliency_map(im_tensor, saliency_map, query if include_query else None) - - - infer( - im_tensor, - x_dim, - y_dim, - text_embeds, - image_embeds, - initial_similarity, - saliency_map, - query, - n_iters, - min_crop_size, - _tqdm=tqdm.notebook.tqdm, - include_query=True, - ); - - - -.. parsed-literal:: - - 0%| | 0/300 [00:00 None: - try: - image_bytes = requests.get(image_link, stream=True).raw - except requests.RequestException as e: - print(f"Cannot load image from link: {image_link}\nException: {e}") - return - - image = Image.open(image_bytes) - image = image.convert("RGB") # remove transparency channel or convert grayscale 1 channel to 3 channels - - build_saliency_map(image, query, n_iters, min_crop_size) - - - -.. parsed-literal:: - - interactive(children=(Text(value='', continuous_update=False, description='image_link'), Text(value='', contin… - - -The second version will enable loading the image from your computer. - -.. code:: ipython3 - - import io - - - load_file_widget = widgets.FileUpload( - accept="image/*", - multiple=False, - description="Image file", - ) - - - @widgets.interact_manual( - file=load_file_widget, - query="", - n_iters=n_iters_widget, - min_crop_size=min_crop_size_widget, - ) - def build_saliency_map_from_file( - file: Path, - query: str = "", - n_iters: int = 2000, - min_crop_size: int = 50, - ) -> None: - image_bytes = io.BytesIO(file[0]["content"]) - try: - image = Image.open(image_bytes) - except Exception as e: - print(f"Cannot load the image: {e}") - return - - image = image.convert("RGB") - - build_saliency_map(image, query, n_iters, min_crop_size) - - - -.. parsed-literal:: - - interactive(children=(FileUpload(value=(), accept='image/*', description='Image file'), Text(value='', continu… - - -Interactive demo with Gradio ----------------------------- - - - -.. 
code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/clip-language-saliency-map/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(build_saliency_map) - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - - -.. parsed-literal:: - - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. - - - - - - - - -What To Do Next ---------------- - - - -Now that you have a convenient interface and accelerated inference, you -can explore the CLIP capabilities further. For example: - -- Can CLIP read? Can it detect text regions in general and specific - words on the image? -- Which famous people and places does CLIP know? -- Can CLIP identify places on a map? Or planets, stars, and - constellations? -- Explore different CLIP models from HuggingFace Hub: just change the - ``model_checkpoint`` at the beginning of the notebook. -- Add batch processing to the pipeline: modify - ``get_random_crop_params``, ``get_cropped_image`` and - ``update_saliency_map`` functions to process multiple crop images at - once and accelerate the pipeline even more. -- Optimize models with - `NNCF `__ - to get further acceleration. You can find example how to quantize - CLIP model in `this - notebook `__ diff --git a/docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_15_0.png b/docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_15_0.png deleted file mode 100644 index bac75bcf983091..00000000000000 --- a/docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_15_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:80b23dd6a69c615e3d374ecfce473bbbfa1491c0e538a391d1c2ca88ea53ca37 -size 74041 diff --git a/docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_17_0.png b/docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_17_0.png deleted file mode 100644 index 4beb6c5c300ecb..00000000000000 --- a/docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_17_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:754dcfb3e248c5ab823db8785e22d9c59a3571a0b6ccd6d028dab59446345797 -size 473017 diff --git a/docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_19_1.png b/docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_19_1.png deleted file mode 100644 index 8d966d74bee754..00000000000000 --- a/docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_19_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2bf4b05549342476c8b2e37070ff4e8e6b141ae8d3c20c1ac2b18babbf1208d4 -size 472958 diff --git a/docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_29_1.png 
b/docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_29_1.png deleted file mode 100644 index ce4a0fb00a9f06..00000000000000 --- a/docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_29_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a5a1f5350d5312a5a71439c08b6fdab7fb2cd48ca31945e5b96e500ca33ec02a -size 475731 diff --git a/docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_35_1.png b/docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_35_1.png deleted file mode 100644 index dbaf3943e91332..00000000000000 --- a/docs/notebooks/clip-language-saliency-map-with-output_files/clip-language-saliency-map-with-output_35_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ef15ba5b5e6ade63354a852c28352cdcb5bce50f0014ac67d00a909fc2704723 -size 471008 diff --git a/docs/notebooks/clip-zero-shot-classification-with-output.rst b/docs/notebooks/clip-zero-shot-classification-with-output.rst deleted file mode 100644 index 58ffb6a32ad534..00000000000000 --- a/docs/notebooks/clip-zero-shot-classification-with-output.rst +++ /dev/null @@ -1,793 +0,0 @@ -Zero-shot Image Classification with OpenAI CLIP and OpenVINO™ -============================================================= - -Zero-shot image classification is a computer vision task to classify -images into one of several classes without any prior training or -knowledge of the classes. - -.. figure:: https://user-images.githubusercontent.com/29454499/207773481-d77cacf8-6cdc-4765-a31b-a1669476d620.png - :alt: zero-shot-pipeline - - zero-shot-pipeline - -`\**image -source\* `__ - -Zero-shot learning resolves several challenges in image retrieval -systems. For example, with the rapid growth of categories on the web, it -is challenging to index images based on unseen categories. We can -associate unseen categories to images with zero-shot learning by -exploiting attributes to model’s relationship between visual features -and labels. In this tutorial, we will use the `OpenAI -CLIP `__ model to perform zero-shot -image classification. Additionally, the notebook demonstrates how to -optimize the model using -`NNCF `__. - -The notebook contains the following steps: - -1. Download the model. -2. Instantiate the PyTorch model. -3. Convert model to OpenVINO IR, using the model conversion API. -4. Run CLIP with OpenVINO. -5. Quantize the converted model with NNCF. -6. Check the quantized model inference result. -7. Compare model size of converted and quantized models. -8. Compare performance of converted and quantized models. -9. Launch interactive demo - - -**Table of contents:** - - -- `Instantiate model <#instantiate-model>`__ -- `Run PyTorch model inference <#run-pytorch-model-inference>`__ -- `Convert model to OpenVINO Intermediate Representation (IR) - format. 
<#convert-model-to-openvino-intermediate-representation-ir-format->`__ -- `Run OpenVINO model <#run-openvino-model>`__ - - - `Select inference device <#select-inference-device>`__ - -- `Quantize model to INT8 using - NNCF <#quantize-model-to-int8-using-nncf>`__ - - - `Prepare datasets <#prepare-datasets>`__ - - `Perform quantization <#perform-quantization>`__ - - `Run quantized OpenVINO model <#run-quantized-openvino-model>`__ - - `Compare File Size <#compare-file-size>`__ - - `Compare inference time of the FP16 IR and quantized - models <#compare-inference-time-of-the-fp16-ir-and-quantized-models>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Instantiate model ------------------ - - - -CLIP (Contrastive Language-Image Pre-Training) is a neural network -trained on various (image, text) pairs. It can be instructed in natural -language to predict the most relevant text snippet, given an image, -without directly optimizing for the task. CLIP uses a -`ViT `__ like transformer to get -visual features and a causal language model to get the text features. -The text and visual features are then projected into a latent space with -identical dimensions. The dot product between the projected image and -text features is then used as a similarity score. - -.. figure:: https://raw.githubusercontent.com/openai/CLIP/main/CLIP.png - :alt: clip - - clip - -`\**image_source\* `__ - -You can find more information about this model in the `research -paper `__, `OpenAI -blog `__, `model -card `__ and -GitHub `repository `__. - -In this notebook, we will use -`openai/clip-vit-base-patch16 `__, -available via Hugging Face Transformers, but the same steps are -applicable for other CLIP family models. - -First, we need to create ``CLIPModel`` class object and initialize it -with model configuration and weights, using ``from_pretrained`` method. -The model will be automatically downloaded from Hugging Face Hub and -cached for the next usage. ``CLIPProcessor`` class is a wrapper for -input data preprocessing. It includes both encoding the text using -tokenizer and preparing the images. - -.. code:: ipython3 - - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "gradio>=4.19" "matplotlib>=3.4" "openvino>=2023.1.0" "transformers[torch]>=4.30" "datasets" "nncf>=2.6.0" "torch>=2.1" Pillow - - from pathlib import Path - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("clip-zero-shot-classification.ipynb") - -.. code:: ipython3 - - from transformers import CLIPProcessor, CLIPModel - - # load pre-trained model - model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16") - # load preprocessor for model input - processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16") - -.. 
code:: ipython3 - - from typing import List - import matplotlib.pyplot as plt - import numpy as np - from PIL import Image - - - def visualize_result(image: Image, labels: List[str], probs: np.ndarray, top: int = 5): - """ - Utility function for visualization classification results - params: - image: input image - labels: list of classification labels - probs: model predicted softmaxed probabilities for each label - top: number of the highest probability results for visualization - returns: - None - """ - plt.figure(figsize=(64, 64)) - top_labels = np.argsort(-probs)[: min(top, probs.shape[0])] - top_probs = probs[top_labels] - plt.subplot(8, 8, 1) - plt.imshow(image) - plt.axis("off") - - plt.subplot(8, 8, 2) - y = np.arange(top_probs.shape[-1]) - plt.grid() - plt.barh(y, top_probs) - plt.gca().invert_yaxis() - plt.gca().set_axisbelow(True) - plt.yticks(y, [labels[index] for index in top_labels]) - plt.xlabel("probability") - -Run PyTorch model inference ---------------------------- - - - -To perform classification, define labels and load an image in RGB -format. To give the model wider text context and improve guidance, we -extend the labels description using the template “This is a photo of a”. -Both the list of label descriptions and image should be passed through -the processor to obtain a dictionary with input data in the -model-specific format. The model predicts an image-text similarity score -in raw logits format, which can be normalized to the ``[0, 1]`` range -using the ``softmax`` function. Then, we select labels with the highest -similarity score for the final result. - -.. code:: ipython3 - - import requests - from pathlib import Path - - - sample_path = Path("data/coco.jpg") - sample_path.parent.mkdir(parents=True, exist_ok=True) - - if not sample_path.exists(): - r = requests.get("https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco.jpg") - - with sample_path.open("wb") as f: - f.write(r.content) - - image = Image.open(sample_path) - - input_labels = [ - "cat", - "dog", - "wolf", - "tiger", - "man", - "horse", - "frog", - "tree", - "house", - "computer", - ] - text_descriptions = [f"This is a photo of a {label}" for label in input_labels] - - inputs = processor(text=text_descriptions, images=[image], return_tensors="pt", padding=True) - - results = model(**inputs) - logits_per_image = results["logits_per_image"] # this is the image-text similarity score - probs = logits_per_image.softmax(dim=1).detach().numpy() # we can take the softmax to get the label probabilities - visualize_result(image, input_labels, probs[0]) - - - -.. image:: clip-zero-shot-classification-with-output_files/clip-zero-shot-classification-with-output_6_0.png - - -Convert model to OpenVINO Intermediate Representation (IR) format. ------------------------------------------------------------------- - - - -For best results with OpenVINO, it is recommended to convert the model -to OpenVINO IR format. OpenVINO supports PyTorch via Model conversion -API. To convert the PyTorch model to OpenVINO IR format we will use -``ov.convert_model`` of `model conversion -API `__. -The ``ov.convert_model`` Python function returns an OpenVINO Model -object ready to load on the device and start making predictions. We can -save it on disk for the next usage with ``ov.save_model``. - -.. 
code:: ipython3 - - import openvino as ov - - fp16_model_path = Path("clip-vit-base-patch16.xml") - model.config.torchscript = True - - if not fp16_model_path.exists(): - ov_model = ov.convert_model(model, example_input=dict(inputs)) - ov.save_model(ov_model, fp16_model_path) - -Run OpenVINO model ------------------- - - - -The steps for making predictions with the OpenVINO CLIP model are -similar to the PyTorch model. Let us check the model result using the -same input data from the example above with PyTorch. - -.. code:: ipython3 - - from scipy.special import softmax - - # create OpenVINO core object instance - core = ov.Core() - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - # compile model for loading on device - compiled_model = core.compile_model(fp16_model_path, device.value) - # run inference on preprocessed data and get image-text similarity score - ov_logits_per_image = compiled_model(dict(inputs))[0] - # perform softmax on score - probs = softmax(ov_logits_per_image, axis=1) - # visualize prediction - visualize_result(image, input_labels, probs[0]) - - - -.. image:: clip-zero-shot-classification-with-output_files/clip-zero-shot-classification-with-output_13_0.png - - -Great! Looks like we got the same result. - -Quantize model to INT8 using NNCF ---------------------------------- - - - -The goal of this part of tutorial is to demonstrate how to speed up the -model by applying 8-bit post-training quantization from -`NNCF `__ (Neural Network -Compression Framework) and infer quantized model via OpenVINO™ Toolkit. -`NNCF `__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. The optimization process contains the following steps: - -1. Prepare quantization dataset -2. Quantize the converted OpenVINO model with NNCF. -3. Check the model result using the same input data like we use. -4. Compare model size of converted and quantized models. -5. Compare performance of converted and quantized models. - -.. - - **Note:** quantization process may require additional time and memory - for performing. You can disable it using widget below: - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - # Fetch skip_kernel_extension module - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -Prepare datasets -~~~~~~~~~~~~~~~~ - - - -The `Conceptual -Captions `__ dataset -consisting of ~3.3M images annotated with captions is used to quantize -model. - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - import requests - from io import BytesIO - import numpy as np - from PIL import Image - from requests.packages.urllib3.exceptions import InsecureRequestWarning - requests.packages.urllib3.disable_warnings(InsecureRequestWarning) - - max_length = model.config.text_config.max_position_embeddings - - def check_text_data(data): - """ - Check if the given data is text-based. - """ - if isinstance(data, str): - return True - if isinstance(data, list): - return all(isinstance(x, str) for x in data) - return False - - def get_pil_from_url(url): - """ - Downloads and converts an image from a URL to a PIL Image object. - """ - response = requests.get(url, verify=False, timeout=20) - image = Image.open(BytesIO(response.content)) - return image.convert("RGB") - - def collate_fn(example, image_column="image_url", text_column="caption"): - """ - Preprocesses an example by loading and transforming image and text data. - Checks if the text data in the example is valid by calling the `check_text_data` function. - Downloads the image specified by the URL in the image_column by calling the `get_pil_from_url` function. - If there is any error during the download process, returns None. - Returns the preprocessed inputs with transformed image and text data. - """ - assert len(example) == 1 - example = example[0] - - if not check_text_data(example[text_column]): - raise ValueError("Text data is not valid") - - url = example[image_column] - try: - image = get_pil_from_url(url) - h, w = image.size - if h == 1 or w == 1: - return None - except Exception: - return None - - inputs = processor(text=example[text_column], images=[image], return_tensors="pt", padding=True) - if inputs['input_ids'].shape[1] > max_length: - return None - return inputs - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import torch - from datasets import load_dataset - from tqdm.notebook import tqdm - - def prepare_calibration_data(dataloader, init_steps): - """ - This function prepares calibration data from a dataloader for a specified number of initialization steps. - It iterates over the dataloader, fetching batches and storing the relevant data. - """ - data = [] - print(f"Fetching {init_steps} samples for the initialization...") - with tqdm(total=init_steps) as pbar: - for batch in dataloader: - if len(data) == init_steps: - break - if batch: - pbar.update(1) - with torch.no_grad(): - data.append( - { - "pixel_values": batch["pixel_values"].to("cpu"), - "input_ids": batch["input_ids"].to("cpu"), - "attention_mask": batch["attention_mask"].to("cpu") - } - ) - return data - - - def prepare_dataset(opt_init_steps=50, max_train_samples=1000): - """ - Prepares a vision-text dataset for quantization. - """ - dataset = load_dataset("google-research-datasets/conceptual_captions", trust_remote_code=True) - train_dataset = dataset["train"].shuffle(seed=42) - dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1) - calibration_data = prepare_calibration_data(dataloader, opt_init_steps) - return calibration_data - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import logging - import nncf - - core = ov.Core() - - nncf.set_log_level(logging.ERROR) - - int8_model_path = 'clip-vit-base-patch16_int8.xml' - calibration_data = prepare_dataset() - ov_model = core.read_model(fp16_model_path) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -.. 
parsed-literal:: - - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/datasets/load.py:1429: FutureWarning: The repository for conceptual_captions contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conceptual_captions - You can avoid this message in future by passing the argument `trust_remote_code=True`. - Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`. - warnings.warn( - - -.. parsed-literal:: - - Fetching 50 samples for the initialization... - - - -.. parsed-literal:: - - 0%| | 0/50 [00:00`__ -paper. It provides a framework that enables support for various spatial -contexts such as a depth map, a segmentation map, a scribble, and key -points that can serve as additional conditionings to Diffusion models -such as Stable Diffusion. - -This notebook explores ControlNet in depth, especially a new technique -for imparting high levels of control over the shape of synthesized -images. It demonstrates how to run it, using OpenVINO. An additional -part demonstrates how to run quantization with -`NNCF `__ to speed up -pipeline. Let us get “controlling”! - -Background ----------- - -Stable Diffusion -~~~~~~~~~~~~~~~~ - -`Stable Diffusion `__ is a -text-to-image latent diffusion model created by researchers and -engineers from CompVis, Stability AI, and LAION. Diffusion models as -mentioned above can generate high-quality images. Stable Diffusion is -based on a particular type of diffusion model called Latent Diffusion, -proposed in `High-Resolution Image Synthesis with Latent Diffusion -Models `__ paper. Generally speaking, -diffusion models are machine learning systems that are trained to -denoise random Gaussian noise step by step, to get to a sample of -interest, such as an image. Diffusion models have been shown to achieve -state-of-the-art results for generating image data. But one downside of -diffusion models is that the reverse denoising process is slow because -of its repeated, sequential nature. In addition, these models consume a -lot of memory because they operate in pixel space, which becomes huge -when generating high-resolution images. Latent diffusion can reduce the -memory and compute complexity by applying the diffusion process over a -lower dimensional latent space, instead of using the actual pixel space. -This is the key difference between standard diffusion and latent -diffusion models: in latent diffusion, the model is trained to generate -latent (compressed) representations of the images. - -There are three main components in latent diffusion: - -- A text-encoder, for example `CLIP’s Text - Encoder `__ - for creation condition to generate image from text prompt. -- A U-Net for step-by-step denoising latent image representation. -- An autoencoder (VAE) for encoding input image to latent space (if - required) and decoding latent space to image back after generation. - -For more details regarding Stable Diffusion work, refer to the `project -website `__. -There is a tutorial for Stable Diffusion Text-to-Image generation with -OpenVINO, see the following -`notebook `__. - -ControlNet -~~~~~~~~~~ - -ControlNet is a neural network structure to control diffusion models by -adding extra conditions. Using this new framework, we can capture a -scene, structure, object, or subject pose from an inputted image, and -then transfer that quality to the generation process. 
In practice, this -enables the model to completely retain the original input shape, and -create a novel image that conserves the shape, pose, or outline while -using the novel features from the inputted prompt. - -.. figure:: https://raw.githubusercontent.com/lllyasviel/ControlNet/main/github_page/he.png - :alt: controlnet block - - controlnet block - -Functionally, ControlNet operates by wrapping around an image synthesis -process to impart attention to the shape required to operate the model -using either its inbuilt prediction or one of many additional annotator -models. Referring to the diagram above, we can see, on a rudimentary -level, how ControlNet uses a trainable copy in conjunction with the -original network to modify the final output with respect to the shape of -the input control source. - -By repeating the above simple structure 14 times, we can control stable -diffusion in the following way: - -.. figure:: https://raw.githubusercontent.com/lllyasviel/ControlNet/main/github_page/sd.png - :alt: sd + controlnet - - sd + controlnet - -The input is simultaneously passed through the SD blocks, represented on -the left, while simultaneously being processed by the ControlNet blocks -on the right. This process is almost the same during encoding. When -denoising the image, at each step the SD decoder blocks will receive -control adjustments from the parallel processing path from ControlNet. - -In the end, we are left with a very similar image synthesis pipeline -with an additional control added for the shape of the output features in -the final image. - -Training ControlNet consists of the following steps: - -1. Cloning the pre-trained parameters of a Diffusion model, such as - Stable Diffusion’s latent UNet, (referred to as “trainable copy”) - while also maintaining the pre-trained parameters separately (”locked - copy”). It is done so that the locked parameter copy can preserve the - vast knowledge learned from a large dataset, whereas the trainable - copy is employed to learn task-specific aspects. -2. The trainable and locked copies of the parameters are connected via - “zero convolution” layers (see here for more information) which are - optimized as a part of the ControlNet framework. This is a training - trick to preserve the semantics already learned by a frozen model as - the new conditions are trained. - -The process of extracting specific information from the input image is -called an annotation. ControlNet comes pre-packaged with compatibility -with several annotators-models that help it to identify the shape/form -of the target in the image: - -- Canny Edge Detection -- M-LSD Lines -- HED Boundary -- Scribbles -- Normal Map -- Human Pose Estimation -- Semantic Segmentation -- Depth Estimation - -This tutorial focuses mainly on conditioning by pose. However, the -discussed steps are also applicable to other annotation modes. 
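To make the role of the "zero convolution" layers mentioned above more concrete, the snippet below is a minimal, illustrative sketch (not the actual ControlNet implementation): a 1x1 convolution whose weights and bias are initialized to zero, so that at the start of training the trainable copy contributes nothing and the locked Stable Diffusion weights fully determine the output.

.. code:: ipython3

    # Illustrative sketch only: a zero-initialized 1x1 convolution, the kind of
    # building block ControlNet uses to connect its trainable copy to the locked model.
    import torch


    def zero_conv(channels: int) -> torch.nn.Conv2d:
        conv = torch.nn.Conv2d(channels, channels, kernel_size=1)
        torch.nn.init.zeros_(conv.weight)
        torch.nn.init.zeros_(conv.bias)
        return conv


    x = torch.randn(1, 320, 64, 64)
    # With zero weights and bias, the layer output is all zeros at initialization,
    # so the locked branch alone determines the result until training updates the copy.
    assert torch.all(zero_conv(320)(x) == 0)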
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Instantiating Generation - Pipeline <#instantiating-generation-pipeline>`__ - - - `ControlNet in Diffusers - library <#controlnet-in-diffusers-library>`__ - - `OpenPose <#openpose>`__ - -- `Convert models to OpenVINO Intermediate representation (IR) - format <#convert-models-to-openvino-intermediate-representation-ir-format>`__ - - - `OpenPose conversion <#openpose-conversion>`__ - -- `Select inference device <#select-inference-device>`__ - - - `ControlNet conversion <#controlnet-conversion>`__ - - `UNet conversion <#unet-conversion>`__ - - `Text Encoder <#text-encoder>`__ - - `VAE Decoder conversion <#vae-decoder-conversion>`__ - -- `Prepare Inference pipeline <#prepare-inference-pipeline>`__ -- `Running Text-to-Image Generation with ControlNet Conditioning and - OpenVINO <#running-text-to-image-generation-with-controlnet-conditioning-and-openvino>`__ -- `Select inference device for Stable Diffusion - pipeline <#select-inference-device-for-stable-diffusion-pipeline>`__ -- `Quantization <#quantization>`__ - - - `Prepare calibration datasets <#prepare-calibration-datasets>`__ - - `Run quantization <#run-quantization>`__ - - `Compare model file sizes <#compare-model-file-sizes>`__ - - `Compare inference time of the FP16 and INT8 - pipelines <#compare-inference-time-of-the-fp16-and-int8-pipelines>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - import requests - from pathlib import Path - - utility_files = ["notebook_utils.py", "pip_helper.py"] - - for utility in utility_files: - if not Path(utility).exists(): - r = requests.get(f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/{utility}") - with open(utility, "w") as f: - f.write(r.text) - - - from pip_helper import pip_install - - pip_install("torch>=2.1", "torchvision", "--extra-index-url", "https://download.pytorch.org/whl/cpu") - pip_install( - "diffusers>=0.14.0", - "matplotlib>=3.4", - "transformers>=4.30.2", - "einops", - "timm", - "gradio>=3.36", - "datasets>=2.14.6", - "nncf>=2.7.0", - "opencv-python", - "scipy", - "filelock", - "scikit-image", - "--extra-index-url", - "https://download.pytorch.org/whl/cpu", - ) - pip_install("--no-deps", "controlnet-aux>=0.0.6") - pip_install("openvino>=2023.1.0") - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("controlnet-stable-diffusion.ipynb") - -Instantiating Generation Pipeline ---------------------------------- - - - -ControlNet in Diffusers library -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -For working with Stable Diffusion and ControlNet models, we will use -Hugging Face `Diffusers `__ -library. To experiment with ControlNet, Diffusers exposes the -`StableDiffusionControlNetPipeline `__ -similar to the `other Diffusers -pipelines `__. -Central to the ``StableDiffusionControlNetPipeline`` is the -``controlnet`` argument which enables providing a particularly trained -`ControlNetModel `__ -instance while keeping the pre-trained diffusion model weights the same. 
-The code below demonstrates how to create -``StableDiffusionControlNetPipeline``, using the ``controlnet-openpose`` -controlnet model and ``stable-diffusion-v1-5``: - -.. code:: ipython3 - - import torch - from diffusers import StableDiffusionControlNetPipeline, ControlNetModel - - controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_openpose", torch_dtype=torch.float32) - pipe = StableDiffusionControlNetPipeline.from_pretrained("botp/stable-diffusion-v1-5", controlnet=controlnet) - -OpenPose -~~~~~~~~ - - - -Annotation is an important part of working with ControlNet. -`OpenPose `__ -is a fast keypoint detection model that can extract human poses like -positions of hands, legs, and head. Below is the ControlNet workflow -using OpenPose. Keypoints are extracted from the input image using -OpenPose and saved as a control map containing the positions of -keypoints. It is then fed to Stable Diffusion as an extra conditioning -together with the text prompt. Images are generated based on these two -conditionings. - -.. figure:: https://user-images.githubusercontent.com/29454499/224248986-eedf6492-dd7a-402b-b65d-36de952094ec.png - :alt: controlnet-openpose-pipe - - controlnet-openpose-pipe - -The code below demonstrates how to instantiate the OpenPose model. - -.. code:: ipython3 - - from controlnet_aux import OpenposeDetector - - pose_estimator = OpenposeDetector.from_pretrained("lllyasviel/ControlNet") - -Now, let us check its result on example image: - -.. code:: ipython3 - - from PIL import Image - import matplotlib.pyplot as plt - import numpy as np - from notebook_utils import download_file - - example_url = "https://user-images.githubusercontent.com/29454499/224540208-c172c92a-9714-4a7b-857a-b1e54b4d4791.jpg" - - image_path = Path("example_image.jpg") - if not image_path.exists(): - download_file(example_url, filename="example_image.jpg") - - img = Image.open(image_path) - pose = pose_estimator(img) - - - def visualize_pose_results( - orig_img: Image.Image, - skeleton_img: Image.Image, - left_title: str = "Original image", - right_title: str = "Pose", - ): - """ - Helper function for pose estimationresults visualization - - Parameters: - orig_img (Image.Image): original image - skeleton_img (Image.Image): processed image with body keypoints - left_title (str): title for the left image - right_title (str): title for the right image - Returns: - fig (matplotlib.pyplot.Figure): matplotlib generated figure contains drawing result - """ - orig_img = orig_img.resize(skeleton_img.size) - im_w, im_h = orig_img.size - is_horizontal = im_h <= im_w - figsize = (20, 10) if is_horizontal else (10, 20) - fig, axs = plt.subplots( - 2 if is_horizontal else 1, - 1 if is_horizontal else 2, - figsize=figsize, - sharex="all", - sharey="all", - ) - fig.patch.set_facecolor("white") - list_axes = list(axs.flat) - for a in list_axes: - a.set_xticklabels([]) - a.set_yticklabels([]) - a.get_xaxis().set_visible(False) - a.get_yaxis().set_visible(False) - a.grid(False) - list_axes[0].imshow(np.array(orig_img)) - list_axes[1].imshow(np.array(skeleton_img)) - list_axes[0].set_title(left_title, fontsize=15) - list_axes[1].set_title(right_title, fontsize=15) - fig.subplots_adjust(wspace=0.01 if is_horizontal else 0.00, hspace=0.01 if is_horizontal else 0.1) - fig.tight_layout() - return fig - - - fig = visualize_pose_results(img, pose) - - - -.. 
image:: controlnet-stable-diffusion-with-output_files/controlnet-stable-diffusion-with-output_8_0.png - - -Convert models to OpenVINO Intermediate representation (IR) format ------------------------------------------------------------------- - - - -Starting from 2023.0 release, OpenVINO supports PyTorch models -conversion directly. We need to provide a model object, input data for -model tracing to ``ov.convert_model`` function to obtain OpenVINO -``ov.Model`` object instance. Model can be saved on disk for next -deployment using ``ov.save_model`` function. - -The pipeline consists of five important parts: - -- OpenPose for obtaining annotation based on an estimated pose. -- ControlNet for conditioning by image annotation. -- Text Encoder for creation condition to generate an image from a text - prompt. -- Unet for step-by-step denoising latent image representation. -- Autoencoder (VAE) for decoding latent space to image. - -Let us convert each part: - -OpenPose conversion -~~~~~~~~~~~~~~~~~~~ - - - -OpenPose model is represented in the pipeline as a wrapper on the -PyTorch model which not only detects poses on an input image but is also -responsible for drawing pose maps. We need to convert only the pose -estimation part, which is located inside the wrapper -``pose_estimator.body_estimation.model``. - -.. code:: ipython3 - - from pathlib import Path - import torch - import openvino as ov - - OPENPOSE_OV_PATH = Path("openpose.xml") - - - def cleanup_torchscript_cache(): - """ - Helper for removing cached model representation - """ - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - - - if not OPENPOSE_OV_PATH.exists(): - with torch.no_grad(): - ov_model = ov.convert_model( - pose_estimator.body_estimation.model, - example_input=torch.zeros([1, 3, 184, 136]), - input=[[1, 3, 184, 136]], - ) - ov.save_model(ov_model, OPENPOSE_OV_PATH) - del ov_model - cleanup_torchscript_cache() - print("OpenPose successfully converted to IR") - else: - print(f"OpenPose will be loaded from {OPENPOSE_OV_PATH}") - - -.. parsed-literal:: - - OpenPose will be loaded from openpose.xml - - -To reuse the original drawing procedure, we replace the PyTorch OpenPose -model with the OpenVINO model, using the following code: - -.. 
code:: ipython3 - - from collections import namedtuple - - - class OpenPoseOVModel: - """Helper wrapper for OpenPose model inference""" - - def __init__(self, core, model_path, device="AUTO"): - self.core = core - self.model = core.read_model(model_path) - self._device = device - self.compiled_model = core.compile_model(self.model, device) - - def __call__(self, input_tensor: torch.Tensor): - """ - inference step - - Parameters: - input_tensor (torch.Tensor): tensor with prerpcessed input image - Returns: - predicted keypoints heatmaps - """ - h, w = input_tensor.shape[2:] - input_shape = self.model.input(0).shape - if h != input_shape[2] or w != input_shape[3]: - self.reshape_model(h, w) - results = self.compiled_model(input_tensor) - return torch.from_numpy(results[self.compiled_model.output(0)]), torch.from_numpy(results[self.compiled_model.output(1)]) - - def reshape_model(self, height: int, width: int): - """ - helper method for reshaping model to fit input data - - Parameters: - height (int): input tensor height - width (int): input tensor width - Returns: - None - """ - self.model.reshape({0: [1, 3, height, width]}) - self.compiled_model = self.core.compile_model(self.model, self._device) - - def parameters(self): - Device = namedtuple("Device", ["device"]) - return [Device(torch.device("cpu"))] - - - core = ov.Core() - -Select inference device ------------------------ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - ov_openpose = OpenPoseOVModel(core, OPENPOSE_OV_PATH, device=device.value) - pose_estimator.body_estimation.model = ov_openpose - -.. code:: ipython3 - - pose = pose_estimator(img) - fig = visualize_pose_results(img, pose) - - - -.. image:: controlnet-stable-diffusion-with-output_files/controlnet-stable-diffusion-with-output_17_0.png - - -Great! As we can see, it works perfectly. - -ControlNet conversion -~~~~~~~~~~~~~~~~~~~~~ - - - -The ControlNet model accepts the same inputs like UNet in Stable -Diffusion pipeline and additional condition sample - skeleton key points -map predicted by pose estimator: - -- ``sample`` - latent image sample from the previous step, generation - process has not been started yet, so we will use random noise, -- ``timestep`` - current scheduler step, -- ``encoder_hidden_state`` - hidden state of text encoder, -- ``controlnet_cond`` - condition input annotation. - -The output of the model is attention hidden states from down and middle -blocks, which serves additional context for the UNet model. - -.. 
code:: ipython3 - - import gc - from functools import partial - - inputs = { - "sample": torch.randn((2, 4, 64, 64)), - "timestep": torch.tensor(1), - "encoder_hidden_states": torch.randn((2, 77, 768)), - "controlnet_cond": torch.randn((2, 3, 512, 512)), - } - - input_info = [(name, ov.PartialShape(inp.shape)) for name, inp in inputs.items()] - - CONTROLNET_OV_PATH = Path("controlnet-pose.xml") - controlnet.eval() - with torch.no_grad(): - down_block_res_samples, mid_block_res_sample = controlnet(**inputs, return_dict=False) - - if not CONTROLNET_OV_PATH.exists(): - with torch.no_grad(): - controlnet.forward = partial(controlnet.forward, return_dict=False) - ov_model = ov.convert_model(controlnet, example_input=inputs, input=input_info) - ov.save_model(ov_model, CONTROLNET_OV_PATH) - del ov_model - cleanup_torchscript_cache() - print("ControlNet successfully converted to IR") - else: - print(f"ControlNet will be loaded from {CONTROLNET_OV_PATH}") - - del controlnet - gc.collect() - - -.. parsed-literal:: - - ControlNet will be loaded from controlnet-pose.xml - - - - -.. parsed-literal:: - - 4890 - - - -UNet conversion -~~~~~~~~~~~~~~~ - - - -The process of UNet model conversion remains the same, like for original -Stable Diffusion model, but with respect to the new inputs generated by -ControlNet. - -.. code:: ipython3 - - from typing import Tuple - - UNET_OV_PATH = Path("unet_controlnet.xml") - - dtype_mapping = { - torch.float32: ov.Type.f32, - torch.float64: ov.Type.f64, - torch.int32: ov.Type.i32, - torch.int64: ov.Type.i64, - } - - - class UnetWrapper(torch.nn.Module): - def __init__( - self, - unet, - sample_dtype=torch.float32, - timestep_dtype=torch.int64, - encoder_hidden_states=torch.float32, - down_block_additional_residuals=torch.float32, - mid_block_additional_residual=torch.float32, - ): - super().__init__() - self.unet = unet - self.sample_dtype = sample_dtype - self.timestep_dtype = timestep_dtype - self.encoder_hidden_states_dtype = encoder_hidden_states - self.down_block_additional_residuals_dtype = down_block_additional_residuals - self.mid_block_additional_residual_dtype = mid_block_additional_residual - - def forward( - self, - sample: torch.Tensor, - timestep: torch.Tensor, - encoder_hidden_states: torch.Tensor, - down_block_additional_residuals: Tuple[torch.Tensor], - mid_block_additional_residual: torch.Tensor, - ): - sample.to(self.sample_dtype) - timestep.to(self.timestep_dtype) - encoder_hidden_states.to(self.encoder_hidden_states_dtype) - down_block_additional_residuals = [res.to(self.down_block_additional_residuals_dtype) for res in down_block_additional_residuals] - mid_block_additional_residual.to(self.mid_block_additional_residual_dtype) - return self.unet( - sample, - timestep, - encoder_hidden_states, - down_block_additional_residuals=down_block_additional_residuals, - mid_block_additional_residual=mid_block_additional_residual, - ) - - - def flattenize_inputs(inputs): - flatten_inputs = [] - for input_data in inputs: - if input_data is None: - continue - if isinstance(input_data, (list, tuple)): - flatten_inputs.extend(flattenize_inputs(input_data)) - else: - flatten_inputs.append(input_data) - return flatten_inputs - - - if not UNET_OV_PATH.exists(): - inputs.pop("controlnet_cond", None) - inputs["down_block_additional_residuals"] = down_block_res_samples - inputs["mid_block_additional_residual"] = mid_block_res_sample - - unet = UnetWrapper(pipe.unet) - unet.eval() - - with torch.no_grad(): - ov_model = ov.convert_model(unet, example_input=inputs) - - 
flatten_inputs = flattenize_inputs(inputs.values()) - for input_data, input_tensor in zip(flatten_inputs, ov_model.inputs): - input_tensor.get_node().set_partial_shape(ov.PartialShape(input_data.shape)) - input_tensor.get_node().set_element_type(dtype_mapping[input_data.dtype]) - ov_model.validate_nodes_and_infer_types() - ov.save_model(ov_model, UNET_OV_PATH) - del ov_model - cleanup_torchscript_cache() - del unet - del pipe.unet - gc.collect() - print("Unet successfully converted to IR") - else: - del pipe.unet - print(f"Unet will be loaded from {UNET_OV_PATH}") - gc.collect() - - -.. parsed-literal:: - - Unet will be loaded from unet_controlnet.xml - - - - -.. parsed-literal:: - - 0 - - - -Text Encoder -~~~~~~~~~~~~ - - - -The text-encoder is responsible for transforming the input prompt, for -example, “a photo of an astronaut riding a horse” into an embedding -space that can be understood by the U-Net. It is usually a simple -transformer-based encoder that maps a sequence of input tokens to a -sequence of latent text embeddings. - -The input of the text encoder is tensor ``input_ids``, which contains -indexes of tokens from text processed by the tokenizer and padded to the -maximum length accepted by the model. Model outputs are two tensors: -``last_hidden_state`` - hidden state from the last MultiHeadAttention -layer in the model and ``pooler_out`` - pooled output for whole model -hidden states. - -.. code:: ipython3 - - TEXT_ENCODER_OV_PATH = Path("text_encoder.xml") - - - def convert_encoder(text_encoder: torch.nn.Module, ir_path: Path): - """ - Convert Text Encoder model to OpenVINO IR. - Function accepts text encoder model, prepares example inputs for conversion, and convert it to OpenVINO Model - Parameters: - text_encoder (torch.nn.Module): text_encoder model - ir_path (Path): File for storing model - Returns: - None - """ - if not ir_path.exists(): - input_ids = torch.ones((1, 77), dtype=torch.long) - # switch model to inference mode - text_encoder.eval() - - # disable gradients calculation for reducing memory consumption - with torch.no_grad(): - ov_model = ov.convert_model( - text_encoder, # model instance - example_input=input_ids, # inputs for model tracing - input=([1, 77],), - ) - ov.save_model(ov_model, ir_path) - del ov_model - cleanup_torchscript_cache() - print("Text Encoder successfully converted to IR") - - - if not TEXT_ENCODER_OV_PATH.exists(): - convert_encoder(pipe.text_encoder, TEXT_ENCODER_OV_PATH) - else: - print(f"Text encoder will be loaded from {TEXT_ENCODER_OV_PATH}") - del pipe.text_encoder - gc.collect() - - -.. parsed-literal:: - - Text encoder will be loaded from text_encoder.xml - - - - -.. parsed-literal:: - - 0 - - - -VAE Decoder conversion -~~~~~~~~~~~~~~~~~~~~~~ - - - -The VAE model has two parts, an encoder, and a decoder. The encoder is -used to convert the image into a low-dimensional latent representation, -which will serve as the input to the U-Net model. The decoder, -conversely, transforms the latent representation back into an image. - -During latent diffusion training, the encoder is used to get the latent -representations (latents) of the images for the forward diffusion -process, which applies more and more noise at each step. During -inference, the denoised latents generated by the reverse diffusion -process are converted back into images using the VAE decoder. During -inference, we will see that we **only need the VAE decoder**. You can -find instructions on how to convert the encoder part in a stable -diffusion -`notebook `__. - -.. 
code:: ipython3 - - VAE_DECODER_OV_PATH = Path("vae_decoder.xml") - - - def convert_vae_decoder(vae: torch.nn.Module, ir_path: Path): - """ - Convert VAE model to IR format. - Function accepts pipeline, creates wrapper class for export only necessary for inference part, - prepares example inputs for convert, - Parameters: - vae (torch.nn.Module): VAE model - ir_path (Path): File for storing model - Returns: - None - """ - - class VAEDecoderWrapper(torch.nn.Module): - def __init__(self, vae): - super().__init__() - self.vae = vae - - def forward(self, latents): - return self.vae.decode(latents) - - if not ir_path.exists(): - vae_decoder = VAEDecoderWrapper(vae) - latents = torch.zeros((1, 4, 64, 64)) - - vae_decoder.eval() - with torch.no_grad(): - ov_model = ov.convert_model( - vae_decoder, - example_input=latents, - input=[ - (1, 4, 64, 64), - ], - ) - ov.save_model(ov_model, ir_path) - del ov_model - cleanup_torchscript_cache() - print("VAE decoder successfully converted to IR") - - - if not VAE_DECODER_OV_PATH.exists(): - convert_vae_decoder(pipe.vae, VAE_DECODER_OV_PATH) - else: - print(f"VAE decoder will be loaded from {VAE_DECODER_OV_PATH}") - - -.. parsed-literal:: - - VAE decoder will be loaded from vae_decoder.xml - - -Prepare Inference pipeline --------------------------- - - - -Putting it all together, let us now take a closer look at how the model -works in inference by illustrating the logical flow. |detailed workflow| - -The stable diffusion model takes both a latent seed and a text prompt as -input. The latent seed is then used to generate random latent image -representations of size :math:`64 \times 64` where as the text prompt is -transformed to text embeddings of size :math:`77 \times 768` via CLIP’s -text encoder. - -Next, the U-Net iteratively *denoises* the random latent image -representations while being conditioned on the text embeddings. In -comparison with the original stable-diffusion pipeline, latent image -representation, encoder hidden states, and control condition annotation -passed via ControlNet on each denoising step for obtaining middle and -down blocks attention parameters, these attention blocks results -additionally will be provided to the UNet model for the control -generation process. The output of the U-Net, being the noise residual, -is used to compute a denoised latent image representation via a -scheduler algorithm. Many different scheduler algorithms can be used for -this computation, each having its pros and cons. For Stable Diffusion, -it is recommended to use one of: - -- `PNDM - scheduler `__ -- `DDIM - scheduler `__ -- `K-LMS - scheduler `__ - -Theory on how the scheduler algorithm function works is out of scope for -this notebook, but in short, you should remember that they compute the -predicted denoised image representation from the previous noise -representation and the predicted noise residual. For more information, -it is recommended to look into `Elucidating the Design Space of -Diffusion-Based Generative Models `__ - -In this tutorial, instead of using Stable Diffusion’s default -`PNDMScheduler `__, -we use one of the currently fastest diffusion model schedulers, called -`UniPCMultistepScheduler `__. -Choosing an improved scheduler can drastically reduce inference time - -in this case, we can reduce the number of inference steps from 50 to 20 -while more or less keeping the same image generation quality. More -information regarding schedulers can be found -`here `__. 
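As a short illustration of the scheduler swap described above (a minimal sketch; the variable names are assumptions and ``pipe`` refers to the Diffusers pipeline created earlier), the default scheduler configuration can be reused to instantiate ``UniPCMultistepScheduler`` and prepare a reduced number of denoising steps:

.. code:: ipython3

    # Sketch: replace the pipeline's default scheduler with UniPCMultistepScheduler,
    # which allows roughly 20 denoising steps instead of the default 50.
    from diffusers import UniPCMultistepScheduler

    scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
    scheduler.set_timesteps(20)
    print(len(scheduler.timesteps))  # number of scheduler timesteps prepared for inference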
- -The *denoising* process is repeated a given number of times (by default -50) to step-by-step retrieve better latent image representations. Once -complete, the latent image representation is decoded by the decoder part -of the variational auto-encoder. - -Similarly to Diffusers ``StableDiffusionControlNetPipeline``, we define -our own ``OVContrlNetStableDiffusionPipeline`` inference pipeline based -on OpenVINO. - -.. |detailed workflow| image:: https://user-images.githubusercontent.com/29454499/224261720-2d20ca42-f139-47b7-b8b9-0b9f30e1ae1e.png - -.. code:: ipython3 - - from diffusers import DiffusionPipeline - from transformers import CLIPTokenizer - from typing import Union, List, Optional, Tuple - import cv2 - - - def scale_fit_to_window(dst_width: int, dst_height: int, image_width: int, image_height: int): - """ - Preprocessing helper function for calculating image size for resize with peserving original aspect ratio - and fitting image to specific window size - - Parameters: - dst_width (int): destination window width - dst_height (int): destination window height - image_width (int): source image width - image_height (int): source image height - Returns: - result_width (int): calculated width for resize - result_height (int): calculated height for resize - """ - im_scale = min(dst_height / image_height, dst_width / image_width) - return int(im_scale * image_width), int(im_scale * image_height) - - - def preprocess(image: Image.Image): - """ - Image preprocessing function. Takes image in PIL.Image format, resizes it to keep aspect ration and fits to model input window 512x512, - then converts it to np.ndarray and adds padding with zeros on right or bottom side of image (depends from aspect ratio), after that - converts data to float32 data type and change range of values from [0, 255] to [-1, 1], finally, converts data layout from planar NHWC to NCHW. - The function returns preprocessed input tensor and padding size, which can be used in postprocessing. 
- - Parameters: - image (Image.Image): input image - Returns: - image (np.ndarray): preprocessed image tensor - pad (Tuple[int]): pading size for each dimension for restoring image size in postprocessing - """ - src_width, src_height = image.size - dst_width, dst_height = scale_fit_to_window(512, 512, src_width, src_height) - image = np.array(image.resize((dst_width, dst_height), resample=Image.Resampling.LANCZOS))[None, :] - pad_width = 512 - dst_width - pad_height = 512 - dst_height - pad = ((0, 0), (0, pad_height), (0, pad_width), (0, 0)) - image = np.pad(image, pad, mode="constant") - image = image.astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - return image, pad - - - def randn_tensor( - shape: Union[Tuple, List], - dtype: Optional[np.dtype] = np.float32, - ): - """ - Helper function for generation random values tensor with given shape and data type - - Parameters: - shape (Union[Tuple, List]): shape for filling random values - dtype (np.dtype, *optiona*, np.float32): data type for result - Returns: - latents (np.ndarray): tensor with random values with given data type and shape (usually represents noise in latent space) - """ - latents = np.random.randn(*shape).astype(dtype) - - return latents - - - class OVContrlNetStableDiffusionPipeline(DiffusionPipeline): - """ - OpenVINO inference pipeline for Stable Diffusion with ControlNet guidence - """ - - def __init__( - self, - tokenizer: CLIPTokenizer, - scheduler, - core: ov.Core, - controlnet: ov.Model, - text_encoder: ov.Model, - unet: ov.Model, - vae_decoder: ov.Model, - device: str = "AUTO", - ): - super().__init__() - self.tokenizer = tokenizer - self.vae_scale_factor = 8 - self.scheduler = scheduler - self.load_models(core, device, controlnet, text_encoder, unet, vae_decoder) - self.set_progress_bar_config(disable=True) - - def load_models( - self, - core: ov.Core, - device: str, - controlnet: ov.Model, - text_encoder: ov.Model, - unet: ov.Model, - vae_decoder: ov.Model, - ): - """ - Function for loading models on device using OpenVINO - - Parameters: - core (Core): OpenVINO runtime Core class instance - device (str): inference device - controlnet (Model): OpenVINO Model object represents ControlNet - text_encoder (Model): OpenVINO Model object represents text encoder - unet (Model): OpenVINO Model object represents UNet - vae_decoder (Model): OpenVINO Model object represents vae decoder - Returns - None - """ - self.text_encoder = core.compile_model(text_encoder, device) - self.text_encoder_out = self.text_encoder.output(0) - self.register_to_config(controlnet=core.compile_model(controlnet, device)) - self.register_to_config(unet=core.compile_model(unet, device)) - self.unet_out = self.unet.output(0) - self.vae_decoder = core.compile_model(vae_decoder, device) - self.vae_decoder_out = self.vae_decoder.output(0) - - def __call__( - self, - prompt: Union[str, List[str]], - image: Image.Image, - num_inference_steps: int = 10, - negative_prompt: Union[str, List[str]] = None, - guidance_scale: float = 7.5, - controlnet_conditioning_scale: float = 1.0, - eta: float = 0.0, - latents: Optional[np.array] = None, - output_type: Optional[str] = "pil", - ): - """ - Function invoked when calling the pipeline for generation. - - Parameters: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - image (`Image.Image`): - `Image`, or tensor representing an image batch which will be repainted according to `prompt`. 
- num_inference_steps (`int`, *optional*, defaults to 100): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - negative_prompt (`str` or `List[str]`): - negative prompt or prompts for generation - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. This pipeline requires a value of at least `1`. - latents (`np.ndarray`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `Image.Image` or `np.array`. - Returns: - image ([List[Union[np.ndarray, Image.Image]]): generaited images - - """ - - # 1. Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # 2. Encode input prompt - text_embeddings = self._encode_prompt(prompt, negative_prompt=negative_prompt) - - # 3. Preprocess image - orig_width, orig_height = image.size - image, pad = preprocess(image) - height, width = image.shape[-2:] - if do_classifier_free_guidance: - image = np.concatenate(([image] * 2)) - - # 4. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 6. Prepare latent variables - num_channels_latents = 4 - latents = self.prepare_latents( - batch_size, - num_channels_latents, - height, - width, - text_embeddings.dtype, - latents, - ) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # Expand the latents if we are doing classifier free guidance. - # The latents are expanded 3 times because for pix2pix the guidance\ - # is applied for both the text and the input image. 
- latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - result = self.controlnet([latent_model_input, t, text_embeddings, image]) - down_and_mid_blok_samples = [sample * controlnet_conditioning_scale for _, sample in result.items()] - - # predict the noise residual - noise_pred = self.unet([latent_model_input, t, text_embeddings, *down_and_mid_blok_samples])[self.unet_out] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1] - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(torch.from_numpy(noise_pred), t, torch.from_numpy(latents)).prev_sample.numpy() - - # update progress - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - - # 8. Post-processing - image = self.decode_latents(latents, pad) - - # 9. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - image = [img.resize((orig_width, orig_height), Image.Resampling.LANCZOS) for img in image] - else: - image = [cv2.resize(img, (orig_width, orig_width)) for img in image] - - return image - - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int = 1, - do_classifier_free_guidance: bool = True, - negative_prompt: Union[str, List[str]] = None, - ): - """ - Encodes the prompt into text encoder hidden states. - - Parameters: - prompt (str or list(str)): prompt to be encoded - num_images_per_prompt (int): number of images that should be generated per prompt - do_classifier_free_guidance (bool): whether to use classifier free guidance or not - negative_prompt (str or list(str)): negative prompt to be encoded - Returns: - text_embeddings (np.ndarray): text encoder hidden states - """ - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - # tokenize input prompts - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - - text_embeddings = self.text_encoder(text_input_ids)[self.text_encoder_out] - - # duplicate text embeddings for each generation per prompt - if num_images_per_prompt != 1: - bs_embed, seq_len, _ = text_embeddings.shape - text_embeddings = np.tile(text_embeddings, (1, num_images_per_prompt, 1)) - text_embeddings = np.reshape(text_embeddings, (bs_embed * num_images_per_prompt, seq_len, -1)) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_tokens: List[str] - max_length = text_input_ids.shape[-1] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - else: - uncond_tokens = negative_prompt - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - - uncond_embeddings = self.text_encoder(uncond_input.input_ids)[self.text_encoder_out] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = uncond_embeddings.shape[1] - uncond_embeddings = np.tile(uncond_embeddings, (1, num_images_per_prompt, 1)) - uncond_embeddings = np.reshape(uncond_embeddings, 
(batch_size * num_images_per_prompt, seq_len, -1)) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings = np.concatenate([uncond_embeddings, text_embeddings]) - - return text_embeddings - - def prepare_latents( - self, - batch_size: int, - num_channels_latents: int, - height: int, - width: int, - dtype: np.dtype = np.float32, - latents: np.ndarray = None, - ): - """ - Preparing noise to image generation. If initial latents are not provided, they will be generated randomly, - then prepared latents scaled by the standard deviation required by the scheduler - - Parameters: - batch_size (int): input batch size - num_channels_latents (int): number of channels for noise generation - height (int): image height - width (int): image width - dtype (np.dtype, *optional*, np.float32): dtype for latents generation - latents (np.ndarray, *optional*, None): initial latent noise tensor, if not provided will be generated - Returns: - latents (np.ndarray): scaled initial noise for diffusion - """ - shape = ( - batch_size, - num_channels_latents, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - if latents is None: - latents = randn_tensor(shape, dtype=dtype) - else: - latents = latents - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - def decode_latents(self, latents: np.array, pad: Tuple[int]): - """ - Decode predicted image from latent space using VAE Decoder and unpad image result - - Parameters: - latents (np.ndarray): image encoded in diffusion latent space - pad (Tuple[int]): each side padding sizes obtained on preprocessing step - Returns: - image: decoded by VAE decoder image - """ - latents = 1 / 0.18215 * latents - image = self.vae_decoder(latents)[self.vae_decoder_out] - (_, end_h), (_, end_w) = pad[1:3] - h, w = image.shape[2:] - unpad_h = h - end_h - unpad_w = w - end_w - image = image[:, :, :unpad_h, :unpad_w] - image = np.clip(image / 2 + 0.5, 0, 1) - image = np.transpose(image, (0, 2, 3, 1)) - return image - -.. 
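code:: python
-
-    # A small numeric check of the classifier-free guidance update used in the
-    # denoising loop above (values are illustrative, not from the notebook):
-    # with guidance_scale == 1 the result collapses to the text-conditioned
-    # prediction alone, while larger values move further away from the
-    # unconditional prediction.
-    import numpy as np
-
-    noise_pred_uncond = np.array([0.2, 0.4])
-    noise_pred_text = np.array([0.6, 0.1])
-    for guidance_scale in (1.0, 7.5):
-        print(guidance_scale, noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond))
-    # 1.0 -> [0.6  0.1]   (identical to noise_pred_text, i.e. no guidance)
-    # 7.5 -> [3.2 -1.85]
-
-..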
code:: ipython3 - - from transformers import CLIPTokenizer - from diffusers import UniPCMultistepScheduler - - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) - - - def visualize_results(orig_img: Image.Image, skeleton_img: Image.Image, result_img: Image.Image): - """ - Helper function for results visualization - - Parameters: - orig_img (Image.Image): original image - skeleton_img (Image.Image): image with body pose keypoints - result_img (Image.Image): generated image - Returns: - fig (matplotlib.pyplot.Figure): matplotlib generated figure contains drawing result - """ - orig_title = "Original image" - skeleton_title = "Pose" - orig_img = orig_img.resize(result_img.size) - im_w, im_h = orig_img.size - is_horizontal = im_h <= im_w - figsize = (20, 20) - fig, axs = plt.subplots( - 3 if is_horizontal else 1, - 1 if is_horizontal else 3, - figsize=figsize, - sharex="all", - sharey="all", - ) - fig.patch.set_facecolor("white") - list_axes = list(axs.flat) - for a in list_axes: - a.set_xticklabels([]) - a.set_yticklabels([]) - a.get_xaxis().set_visible(False) - a.get_yaxis().set_visible(False) - a.grid(False) - list_axes[0].imshow(np.array(orig_img)) - list_axes[1].imshow(np.array(skeleton_img)) - list_axes[2].imshow(np.array(result_img)) - list_axes[0].set_title(orig_title, fontsize=15) - list_axes[1].set_title(skeleton_title, fontsize=15) - list_axes[2].set_title("Result", fontsize=15) - fig.subplots_adjust(wspace=0.01 if is_horizontal else 0.00, hspace=0.01 if is_horizontal else 0.1) - fig.tight_layout() - fig.savefig("result.png", bbox_inches="tight") - return fig - -Running Text-to-Image Generation with ControlNet Conditioning and OpenVINO --------------------------------------------------------------------------- - - - -Now, we are ready to start generation. For improving the generation -process, we also introduce an opportunity to provide a -``negative prompt``. Technically, positive prompt steers the diffusion -toward the images associated with it, while negative prompt steers the -diffusion away from it. More explanation of how it works can be found in -this -`article `__. -We can keep this field empty if we want to generate image without -negative prompting. - -Select inference device for Stable Diffusion pipeline ------------------------------------------------------ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - core = ov.Core() - - device = device_widget("CPU") - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -.. code:: ipython3 - - ov_pipe = OVContrlNetStableDiffusionPipeline( - tokenizer, - scheduler, - core, - CONTROLNET_OV_PATH, - TEXT_ENCODER_OV_PATH, - UNET_OV_PATH, - VAE_DECODER_OV_PATH, - device=device.value, - ) - -.. code:: ipython3 - - np.random.seed(42) - - pose = pose_estimator(img) - - prompt = "Dancing Darth Vader, best quality, extremely detailed" - negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality" - result = ov_pipe(prompt, pose, 20, negative_prompt=negative_prompt) - - result[0] - - -.. parsed-literal:: - - /home/ltalamanova/tmp_venv/lib/python3.11/site-packages/diffusers/configuration_utils.py:139: FutureWarning: Accessing config attribute `controlnet` directly via 'OVContrlNetStableDiffusionPipeline' object attribute is deprecated. 
Please access 'controlnet' over 'OVContrlNetStableDiffusionPipeline's config object instead, e.g. 'scheduler.config.controlnet'. - deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) - - - - -.. image:: controlnet-stable-diffusion-with-output_files/controlnet-stable-diffusion-with-output_34_1.png - - - -Quantization ------------- - - - -`NNCF `__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -According to ``OVContrlNetStableDiffusionPipeline`` structure, -ControlNet and UNet are used in the cycle repeating inference on each -diffusion step, while other parts of pipeline take part only once. That -is why computation cost and speed of ControlNet and UNet become the -critical path in the pipeline. Quantizing the rest of the SD pipeline -does not significantly improve inference performance but can lead to a -substantial degradation of accuracy. - -The optimization process contains the following steps: - -1. Create a calibration dataset for quantization. -2. Run ``nncf.quantize()`` to obtain quantized model. -3. Save the ``INT8`` model using ``openvino.save_model()`` function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - import requests - - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - int8_pipe = None - - %load_ext skip_kernel_extension - -Prepare calibration datasets -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use a portion of -`jschoormans/humanpose_densepose `__ -dataset from Hugging Face as calibration data. We use a prompts below as -negative prompts for ControlNet and UNet. To collect intermediate model -inputs for calibration we should customize ``CompiledModel``. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - negative_prompts = [ - "blurry unreal occluded", - "low contrast disfigured uncentered mangled", - "amateur out of frame low quality nsfw", - "ugly underexposed jpeg artifacts", - "low saturation disturbing content", - "overexposed severe distortion", - "amateur NSFW", - "ugly mutilated out of frame disfigured", - ] - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import datasets - - num_inference_steps = 20 - subset_size = 200 - - dataset = datasets.load_dataset("jschoormans/humanpose_densepose", split="train", streaming=True).shuffle(seed=42) - input_data = [] - for batch in dataset: - caption = batch["caption"] - if len(caption) > tokenizer.model_max_length: - continue - img = batch["file_name"] - input_data.append((caption, pose_estimator(img))) - if len(input_data) >= subset_size // num_inference_steps: - break - -.. 
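code:: python
-
-    # Sketch of steps 2 and 3 from the quantization outline above. The original
-    # notebook runs the actual calls only after collecting calibration data with
-    # the CompiledModelDecorator defined in the next cell: `unet_calibration_data`
-    # below refers to the list returned by collect_calibration_data there, and
-    # UNET_INT8_OV_PATH to the INT8 output path defined further down. The
-    # ControlNet model is quantized analogously.
-    import nncf
-
-    quantized_unet = nncf.quantize(
-        core.read_model(UNET_OV_PATH),
-        calibration_dataset=nncf.Dataset(unet_calibration_data),
-        subset_size=subset_size,
-    )
-    ov.save_model(quantized_unet, UNET_INT8_OV_PATH)
-
-..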
code:: ipython3 - - %%skip not $to_quantize.value - - import datasets - from tqdm.notebook import tqdm - from transformers import set_seed - from typing import Any, Dict, List - - set_seed(42) - - class CompiledModelDecorator(ov.CompiledModel): - def __init__(self, compiled_model: ov.CompiledModel, keep_prob: float = 1.0): - super().__init__(compiled_model) - self.data_cache = [] - self.keep_prob = np.clip(keep_prob, 0, 1) - - def __call__(self, *args, **kwargs): - if np.random.rand() <= self.keep_prob: - self.data_cache.append(*args) - return super().__call__(*args, **kwargs) - - def collect_calibration_data(pipeline: OVContrlNetStableDiffusionPipeline, subset_size: int) -> List[Dict]: - original_unet = pipeline.unet - pipeline.unet = CompiledModelDecorator(original_unet) - pipeline.set_progress_bar_config(disable=True) - - pbar = tqdm(total=subset_size) - for prompt, pose in input_data: - img = batch["file_name"] - negative_prompt = np.random.choice(negative_prompts) - _ = pipeline(prompt, pose, num_inference_steps, negative_prompt=negative_prompt) - collected_subset_size = len(pipeline.unet.data_cache) - pbar.update(collected_subset_size - pbar.n) - if collected_subset_size >= subset_size: - break - - calibration_dataset = pipeline.unet.data_cache[:subset_size] - pipeline.set_progress_bar_config(disable=False) - pipeline.unet = original_unet - return calibration_dataset - -.. code:: ipython3 - - %%skip not $to_quantize.value - - CONTROLNET_INT8_OV_PATH = Path("controlnet-pose_int8.xml") - UNET_INT8_OV_PATH = Path("unet_controlnet_int8.xml") - - if not (CONTROLNET_INT8_OV_PATH.exists() and UNET_INT8_OV_PATH.exists()): - unet_calibration_data = collect_calibration_data(ov_pipe, subset_size=subset_size) - - - -.. parsed-literal:: - - 0%| | 0/200 [00:00`__ -- `Fetching example models <#fetching-example-models>`__ -- `Conversion <#conversion>`__ - - - `Setting Input Shapes <#setting-input-shapes>`__ - - `Compressing a Model to FP16 <#compressing-a-model-to-fp16>`__ - - `Convert Models from memory <#convert-models-from-memory>`__ - -- `Migration from Legacy conversion - API <#migration-from-legacy-conversion-api>`__ - - - `Specifying Layout <#specifying-layout>`__ - - `Changing Model Layout <#changing-model-layout>`__ - - `Specifying Mean and Scale - Values <#specifying-mean-and-scale-values>`__ - - `Reversing Input Channels <#reversing-input-channels>`__ - - `Cutting Off Parts of a Model <#cutting-off-parts-of-a-model>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - # Required imports. Please execute this cell first. - %pip install -q "tensorflow-macos>=2.5; sys_platform == 'darwin' and platform_machine == 'arm64'" - %pip install -q "tensorflow>=2.5; sys_platform == 'darwin' and platform_machine != 'arm64'" - %pip install -q "tensorflow>=2.5; sys_platform != 'darwin'" - %pip install -q --no-deps "tensorflow-hub" - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu \ - "openvino>=2024.4.0" "requests" "tqdm" "transformers>=4.31" "onnx!=1.16.2" "torch>=2.1" "torchvision" "tf_keras" - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. 
- ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. - fastapi 0.115.8 requires typing-extensions>=4.8.0, but you have typing-extensions 4.5.0 which is incompatible. - mypy 1.14.1 requires typing-extensions>=4.6.0, but you have typing-extensions 4.5.0 which is incompatible. - pydantic 2.10.6 requires typing-extensions>=4.12.2, but you have typing-extensions 4.5.0 which is incompatible. - pydantic-core 2.27.2 requires typing-extensions!=4.7.0,>=4.6.0, but you have typing-extensions 4.5.0 which is incompatible. - torch 2.4.1+cpu requires typing-extensions>=4.8.0, but you have typing-extensions 4.5.0 which is incompatible. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. - tensorflow 2.13.1 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.12.2 which is incompatible. - Note: you may need to restart the kernel to use updated packages. - - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("convert-to-openvino.ipynb") - -OpenVINO IR format ------------------- - - - -OpenVINO `Intermediate Representation -(IR) `__ -is the proprietary model format of OpenVINO. It is produced after -converting a model with model conversion API. Model conversion API -translates the frequently used deep learning operations to their -respective similar representation in OpenVINO and tunes them with the -associated weights and biases from the trained model. The resulting IR -contains two files: an ``.xml`` file, containing information about -network topology, and a ``.bin`` file, containing the weights and biases -binary data. - -There are two ways to convert a model from the original framework format -to OpenVINO IR: Python conversion API and OVC command-line tool. You can -choose one of them based on whichever is most convenient for you. - -OpenVINO conversion API supports next model formats: ``PyTorch``, -``TensorFlow``, ``TensorFlow Lite``, ``ONNX``, and ``PaddlePaddle``. -These model formats can be read, compiled, and converted to OpenVINO IR, -either automatically or explicitly. - -For more details, refer to `Model -Preparation `__ -documentation. - -.. code:: ipython3 - - # OVC CLI tool parameters description - - ! ovc --help - - -.. parsed-literal:: - - usage: ovc INPUT_MODEL... [-h] [--output_model OUTPUT_MODEL] - [--compress_to_fp16 [True | False]] [--version] [--input INPUT] - [--output OUTPUT] [--extension EXTENSION] [--verbose] - - positional arguments: - INPUT_MODEL Input model file(s) from TensorFlow, ONNX, - PaddlePaddle. Use openvino.convert_model in Python to - convert models from PyTorch. 
- - optional arguments: - -h, --help show this help message and exit - --output_model OUTPUT_MODEL - This parameter is used to name output .xml/.bin files - of converted model. Model name or output directory can - be passed. If output directory is passed, the - resulting .xml/.bin files are named by original model - name. - --compress_to_fp16 [True | False] - Compress weights in output OpenVINO model to FP16. To - turn off compression use "--compress_to_fp16=False" - command line parameter. Default value is True. - --version Print ovc version and exit. - --input INPUT Information of model input required for model - conversion. This is a comma separated list with - optional input names and shapes. The order of inputs - in converted model will match the order of specified - inputs. The shape is specified as comma-separated - list. Example, to set `input_1` input with shape - [1,100] and `sequence_len` input with shape [1,?]: - "input_1[1,100],sequence_len[1,?]", where "?" is a - dynamic dimension, which means that such a dimension - can be specified later in the runtime. If the - dimension is set as an integer (like 100 in [1,100]), - such a dimension is not supposed to be changed later, - during a model conversion it is treated as a static - value. Example with unnamed inputs: "[1,100],[1,?]". - --output OUTPUT One or more comma-separated model outputs to be - preserved in the converted model. Other outputs are - removed. If `output` parameter is not specified then - all outputs from the original model are preserved. Do - not add :0 to the names for TensorFlow. The order of - outputs in the converted model is the same as the - order of specified names. Example: ovc model.onnx - output=out_1,out_2 - --extension EXTENSION - Paths or a comma-separated list of paths to libraries - (.so or .dll) with extensions. - --verbose Print detailed information about conversion. - - -Fetching example models ------------------------ - - - -This notebook uses two models for conversion examples: - -- `Distilbert `__ - NLP model from Hugging Face -- `Resnet50 `__ - CV classification model from torchvision - -.. code:: ipython3 - - from pathlib import Path - - # create a directory for models files - MODEL_DIRECTORY_PATH = Path("model") - MODEL_DIRECTORY_PATH.mkdir(exist_ok=True) - -Fetch -`distilbert `__ -NLP model from Hugging Face and export it in ONNX format: - -.. code:: ipython3 - - from transformers import AutoModelForSequenceClassification, AutoTokenizer - import torch - - ONNX_NLP_MODEL_PATH = MODEL_DIRECTORY_PATH / "distilbert.onnx" - - # download model - hf_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english") - # initialize tokenizer - tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english") - - if not ONNX_NLP_MODEL_PATH.exists(): - inputs = tokenizer("Hi, how are you?", return_tensors="pt") - input_names = list(inputs.keys()) - dynamic_axes = {input_name: {0: "batch_size", 1: "seq_length"} for input_name in input_names} - torch.onnx.export( - hf_model, args=dict(inputs), input_names=input_names, output_names=["logits"], dynamic_axes=dynamic_axes, f=ONNX_NLP_MODEL_PATH, opset_version=14 - ) - print(f"ONNX model exported to {ONNX_NLP_MODEL_PATH}") - - -.. parsed-literal:: - - 2025-02-03 23:50:32.463242: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2025-02-03 23:50:32.496498: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2025-02-03 23:50:33.168481: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - -.. parsed-literal:: - - ONNX model exported to model/distilbert.onnx - - -Fetch -`Resnet50 `__ -CV classification model from torchvision: - -.. code:: ipython3 - - from torchvision.models import resnet50, ResNet50_Weights - - # create model object - pytorch_model = resnet50(weights=ResNet50_Weights.DEFAULT) - # switch model from training to inference mode - pytorch_model.eval() - - - - -.. parsed-literal:: - - ResNet( - (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False) - (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False) - (layer1): Sequential( - (0): Bottleneck( - (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) - (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - (downsample): Sequential( - (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) - (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - ) - ) - (1): Bottleneck( - (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) - (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - ) - (2): Bottleneck( - (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) - (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - ) - ) - (layer2): Sequential( - (0): Bottleneck( - (conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) - (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv3): 
Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - (downsample): Sequential( - (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False) - (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - ) - ) - (1): Bottleneck( - (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) - (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - ) - (2): Bottleneck( - (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) - (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - ) - (3): Bottleneck( - (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) - (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - ) - ) - (layer3): Sequential( - (0): Bottleneck( - (conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) - (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - (downsample): Sequential( - (0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False) - (1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - ) - ) - (1): Bottleneck( - (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) - (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - ) - (2): Bottleneck( - (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn1): BatchNorm2d(256, eps=1e-05, 
momentum=0.1, affine=True, track_running_stats=True) - (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) - (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - ) - (3): Bottleneck( - (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) - (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - ) - (4): Bottleneck( - (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) - (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - ) - (5): Bottleneck( - (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) - (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - ) - ) - (layer4): Sequential( - (0): Bottleneck( - (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) - (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - (downsample): Sequential( - (0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False) - (1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - ) - ) - (1): Bottleneck( - (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) - (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - ) - (2): Bottleneck( - (conv1): Conv2d(2048, 512, kernel_size=(1, 1), 
stride=(1, 1), bias=False) - (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) - (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False) - (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) - (relu): ReLU(inplace=True) - ) - ) - (avgpool): AdaptiveAvgPool2d(output_size=(1, 1)) - (fc): Linear(in_features=2048, out_features=1000, bias=True) - ) - - - -Convert PyTorch model to ONNX format: - -.. code:: ipython3 - - import torch - import warnings - - ONNX_CV_MODEL_PATH = MODEL_DIRECTORY_PATH / "resnet.onnx" - - if ONNX_CV_MODEL_PATH.exists(): - print(f"ONNX model {ONNX_CV_MODEL_PATH} already exists.") - else: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore") - torch.onnx.export(model=pytorch_model, args=torch.randn(1, 3, 224, 224), f=ONNX_CV_MODEL_PATH) - print(f"ONNX model exported to {ONNX_CV_MODEL_PATH}") - - -.. parsed-literal:: - - ONNX model exported to model/resnet.onnx - - -Conversion ----------- - - - -To convert a model to OpenVINO IR, use the following API: - -.. code:: ipython3 - - import openvino as ov - - # ov.convert_model returns an openvino.runtime.Model object - print(ONNX_NLP_MODEL_PATH) - ov_model = ov.convert_model(ONNX_NLP_MODEL_PATH) - - # then model can be serialized to *.xml & *.bin files - ov.save_model(ov_model, MODEL_DIRECTORY_PATH / "distilbert.xml") - - -.. parsed-literal:: - - model/distilbert.onnx - - -.. code:: ipython3 - - ! ovc model/distilbert.onnx --output_model model/distilbert.xml - - -.. parsed-literal:: - - huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... - To disable this warning, you can either: - - Avoid using `tokenizers` before the fork if possible - - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) - - -.. parsed-literal:: - - [ INFO ] Generated IR will be compressed to FP16. If you get lower accuracy, please consider disabling compression by removing argument "compress_to_fp16" or set it to false "compress_to_fp16=False". - Find more information about compression to FP16 at https://docs.openvino.ai/2023.0/openvino_docs_MO_DG_FP16_Compression.html - [ SUCCESS ] XML file: model/distilbert.xml - [ SUCCESS ] BIN file: model/distilbert.bin - - -Setting Input Shapes -^^^^^^^^^^^^^^^^^^^^ - - - -Model conversion is supported for models with dynamic input shapes that -contain undefined dimensions. However, if the shape of data is not going -to change from one inference request to another, it is recommended to -set up static shapes (when all dimensions are fully defined) for the -inputs. Doing so at the model preparation stage, not at runtime, can be -beneficial in terms of performance and memory consumption. - -For more information refer to `Setting Input -Shapes `__ -documentation. - -.. code:: ipython3 - - import openvino as ov - - ov_model = ov.convert_model(ONNX_NLP_MODEL_PATH, input=[("input_ids", [1, 128]), ("attention_mask", [1, 128])]) - -.. code:: ipython3 - - ! ovc model/distilbert.onnx --input input_ids[1,128],attention_mask[1,128] --output_model model/distilbert.xml - - -.. parsed-literal:: - - huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks... - To disable this warning, you can either: - - Avoid using `tokenizers` before the fork if possible - - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) - - -.. parsed-literal:: - - [ INFO ] Generated IR will be compressed to FP16. If you get lower accuracy, please consider disabling compression by removing argument "compress_to_fp16" or set it to false "compress_to_fp16=False". - Find more information about compression to FP16 at https://docs.openvino.ai/2023.0/openvino_docs_MO_DG_FP16_Compression.html - [ SUCCESS ] XML file: model/distilbert.xml - [ SUCCESS ] BIN file: model/distilbert.bin - - -The ``input`` parameter allows overriding original input shapes if it is -supported by the model topology. Shapes with dynamic dimensions in the -original model can be replaced with static shapes for the converted -model, and vice versa. The dynamic dimension can be marked in model -conversion API parameter as ``-1`` or ``?`` when using ``ovc``: - -.. code:: ipython3 - - import openvino as ov - - ov_model = ov.convert_model(ONNX_NLP_MODEL_PATH, input=[("input_ids", [1, -1]), ("attention_mask", [1, -1])]) - -.. code:: ipython3 - - ! ovc model/distilbert.onnx --input "input_ids[1,?],attention_mask[1,?]" --output_model model/distilbert.xml - - -.. parsed-literal:: - - huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... - To disable this warning, you can either: - - Avoid using `tokenizers` before the fork if possible - - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) - - -.. parsed-literal:: - - [ INFO ] Generated IR will be compressed to FP16. If you get lower accuracy, please consider disabling compression by removing argument "compress_to_fp16" or set it to false "compress_to_fp16=False". - Find more information about compression to FP16 at https://docs.openvino.ai/2023.0/openvino_docs_MO_DG_FP16_Compression.html - [ SUCCESS ] XML file: model/distilbert.xml - [ SUCCESS ] BIN file: model/distilbert.bin - - -To optimize memory consumption for models with undefined dimensions in -runtime, model conversion API provides the capability to define -boundaries of dimensions. The boundaries of undefined dimension can be -specified with ellipsis in the command line or with -``openvino.Dimension`` class in Python. For example, launch model -conversion for the ONNX Bert model and specify a boundary for the -sequence length dimension: - -.. code:: ipython3 - - import openvino as ov - - - sequence_length_dim = ov.Dimension(10, 128) - - ov_model = ov.convert_model( - ONNX_NLP_MODEL_PATH, - input=[ - ("input_ids", [1, sequence_length_dim]), - ("attention_mask", [1, sequence_length_dim]), - ], - ) - -.. code:: ipython3 - - ! ovc model/distilbert.onnx --input input_ids[1,10..128],attention_mask[1,10..128] --output_model model/distilbert.xml - - -.. parsed-literal:: - - huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... - To disable this warning, you can either: - - Avoid using `tokenizers` before the fork if possible - - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) - - -.. parsed-literal:: - - [ INFO ] Generated IR will be compressed to FP16. If you get lower accuracy, please consider disabling compression by removing argument "compress_to_fp16" or set it to false "compress_to_fp16=False". 
- Find more information about compression to FP16 at https://docs.openvino.ai/2023.0/openvino_docs_MO_DG_FP16_Compression.html - [ SUCCESS ] XML file: model/distilbert.xml - [ SUCCESS ] BIN file: model/distilbert.bin - - -Compressing a Model to FP16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -By default model weights compressed to FP16 format when saving OpenVINO -model to IR. This saves up to 2x storage space for the model file and in -most cases doesn’t sacrifice model accuracy. Weight compression can be -disabled by setting ``compress_to_fp16`` flag to ``False``: - -.. code:: ipython3 - - import openvino as ov - - ov_model = ov.convert_model(ONNX_NLP_MODEL_PATH) - ov.save_model(ov_model, MODEL_DIRECTORY_PATH / "distilbert.xml", compress_to_fp16=False) - -.. code:: ipython3 - - ! ovc model/distilbert.onnx --output_model model/distilbert.xml --compress_to_fp16=False - - -.. parsed-literal:: - - huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... - To disable this warning, you can either: - - Avoid using `tokenizers` before the fork if possible - - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) - - -.. parsed-literal:: - - [ SUCCESS ] XML file: model/distilbert.xml - [ SUCCESS ] BIN file: model/distilbert.bin - - -Convert Models from memory -^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -Model conversion API supports passing original framework Python object -directly. More details can be found in -`PyTorch `__, -`TensorFlow `__, -`PaddlePaddle `__ -frameworks conversion guides. - -.. code:: ipython3 - - import openvino as ov - import torch - - example_input = torch.rand(1, 3, 224, 224) - - ov_model = ov.convert_model(pytorch_model, example_input=example_input, input=example_input.shape) - - -.. parsed-literal:: - - WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. - - -.. code:: ipython3 - - import os - - import openvino as ov - import tensorflow_hub as hub - - os.environ["TFHUB_CACHE_DIR"] = str(Path("./tfhub_modules").resolve()) - - model = hub.load("https://www.kaggle.com/models/google/movenet/frameworks/TensorFlow2/variations/singlepose-lightning/versions/4") - movenet = model.signatures["serving_default"] - - ov_model = ov.convert_model(movenet) - - -.. parsed-literal:: - - 2025-02-03 23:50:49.384602: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform. - Skipping registering GPU devices... - - -Migration from Legacy conversion API ------------------------------------- - - - -In the 2023.1 OpenVINO release OpenVINO Model Conversion API was -introduced with the corresponding Python API: ``openvino.convert_model`` -method. ``ovc`` and ``openvino.convert_model`` represent a lightweight -alternative of ``mo`` and ``openvino.tools.mo.convert_model`` which are -considered legacy API now. ``mo.convert_model()`` provides a wide range -of preprocessing parameters. Most of these parameters have analogs in -OVC or can be replaced with functionality from ``ov.PrePostProcessor`` -class. Refer to `Optimize Preprocessing -notebook `__ for -more information about `Preprocessing -API `__. 
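-
-The sections below map each legacy parameter to its replacement. As a
-condensed, illustrative preview (reusing ``ONNX_CV_MODEL_PATH`` from above;
-the output file name and values are examples only), the general pattern is:
-convert first, then attach preprocessing with ``PrePostProcessor``, then save:
-
-.. code:: python
-
-    import openvino as ov
-
-    ov_model = ov.convert_model(ONNX_CV_MODEL_PATH)
-
-    prep = ov.preprocess.PrePostProcessor(ov_model)
-    prep.input("input.1").tensor().set_layout(ov.Layout("nchw"))
-    prep.input("input.1").preprocess().mean([255 * x for x in [0.485, 0.456, 0.406]])
-    prep.input("input.1").preprocess().scale([255 * x for x in [0.229, 0.224, 0.225]])
-    ov_model = prep.build()
-
-    ov.save_model(ov_model, MODEL_DIRECTORY_PATH / "resnet_preprocessed.xml")
-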
-Here is the migration guide from legacy model preprocessing to -Preprocessing API. - -Specifying Layout -^^^^^^^^^^^^^^^^^ - - - -Layout defines the meaning of dimensions in a shape and can be specified -for both inputs and outputs. Some preprocessing requires to set input -layouts, for example, setting a batch, applying mean or scales, and -reversing input channels (BGR<->RGB). For the layout syntax, check the -`Layout API -overview `__. -To specify the layout, you can use the layout option followed by the -layout value. - -The following example specifies the ``NCHW`` layout for a Pytorch -Resnet50 model that was exported to the ONNX format: - -.. code:: ipython3 - - # Converter API - import openvino as ov - - ov_model = ov.convert_model(ONNX_CV_MODEL_PATH) - - prep = ov.preprocess.PrePostProcessor(ov_model) - prep.input("input.1").model().set_layout(ov.Layout("nchw")) - ov_model = prep.build() - -.. code:: python - - # Legacy Model Optimizer API - from openvino.tools import mo - - ov_model = mo.convert_model(ONNX_CV_MODEL_PATH, layout="nchw") - -Changing Model Layout -^^^^^^^^^^^^^^^^^^^^^ - - - -Transposing of matrices/tensors is a typical operation in Deep Learning -- you may have a BMP image ``640x480``, which is an array of -``{480, 640, 3}`` elements, but Deep Learning model can require input -with shape ``{1, 3, 480, 640}``. - -Conversion can be done implicitly, using the layout of a user’s tensor -and the layout of an original model: - -.. code:: ipython3 - - # Converter API - import openvino as ov - - ov_model = ov.convert_model(ONNX_CV_MODEL_PATH) - - prep = ov.preprocess.PrePostProcessor(ov_model) - prep.input("input.1").tensor().set_layout(ov.Layout("nhwc")) - prep.input("input.1").model().set_layout(ov.Layout("nchw")) - ov_model = prep.build() - -Legacy Model Optimizer API -========================== - -.. code:: python - - from openvino.tools import mo - - ov_model = mo.convert_model(ONNX_CV_MODEL_PATH, layout="nchw->nhwc") - - # alternatively use source_layout and target_layout parameters - ov_model = mo.convert_model(ONNX_CV_MODEL_PATH, source_layout="nchw", target_layout="nhwc") - -Specifying Mean and Scale Values -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -Using Preprocessing API ``mean`` and ``scale`` values can be set. Using -these API, model embeds the corresponding preprocessing block for -mean-value normalization of the input data and optimizes this block. -Refer to `Optimize Preprocessing -notebook `__ for -more examples. - -.. code:: ipython3 - - # Converter API - import openvino as ov - - ov_model = ov.convert_model(ONNX_CV_MODEL_PATH) - - prep = ov.preprocess.PrePostProcessor(ov_model) - prep.input("input.1").tensor().set_layout(ov.Layout("nchw")) - prep.input("input.1").preprocess().mean([255 * x for x in [0.485, 0.456, 0.406]]) - prep.input("input.1").preprocess().scale([255 * x for x in [0.229, 0.224, 0.225]]) - - ov_model = prep.build() - -.. code:: python - - # Legacy Model Optimizer API - - from openvino.tools import mo - - - ov_model = mo.convert_model( - ONNX_CV_MODEL_PATH, - mean_values=[255 * x for x in [0.485, 0.456, 0.406]], - scale_values=[255 * x for x in [0.229, 0.224, 0.225]], - ) - -Reversing Input Channels -^^^^^^^^^^^^^^^^^^^^^^^^ - - - -Sometimes, input images for your application can be of the ``RGB`` (or -``BGR``) format, and the model is trained on images of the ``BGR`` (or -``RGB``) format, which is in the opposite order of color channels. 
In -this case, it is important to preprocess the input images by reverting -the color channels before inference. - -.. code:: ipython3 - - # Converter API - import openvino as ov - - ov_model = ov.convert_model(ONNX_CV_MODEL_PATH) - - prep = ov.preprocess.PrePostProcessor(ov_model) - prep.input("input.1").tensor().set_layout(ov.Layout("nchw")) - prep.input("input.1").preprocess().reverse_channels() - ov_model = prep.build() - -.. code:: python - - # Legacy Model Optimizer API - from openvino.tools import mo - - ov_model = mo.convert_model(ONNX_CV_MODEL_PATH, reverse_input_channels=True) - -Cutting Off Parts of a Model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -Cutting model inputs and outputs from a model is no longer available in -the new conversion API. Instead, we recommend performing the cut in the -original framework. Examples of model cutting of TensorFlow protobuf, -TensorFlow SavedModel, and ONNX formats with tools provided by the -Tensorflow and ONNX frameworks can be found in `documentation -guide `__. -For PyTorch, TensorFlow 2 Keras, and PaddlePaddle, we recommend changing -the original model code to perform the model cut. diff --git a/docs/notebooks/convnext-classification-with-output.rst b/docs/notebooks/convnext-classification-with-output.rst deleted file mode 100644 index 5e0a727c942b2e..00000000000000 --- a/docs/notebooks/convnext-classification-with-output.rst +++ /dev/null @@ -1,250 +0,0 @@ -Classification with ConvNeXt and OpenVINO -========================================= - -The -`torchvision.models `__ -subpackage contains definitions of models for addressing different -tasks, including: image classification, pixelwise semantic segmentation, -object detection, instance segmentation, person keypoint detection, -video classification, and optical flow. Throughout this notebook we will -show how to use one of them. - -The ConvNeXt model is based on the `A ConvNet for the -2020s `__ paper. The outcome of this -exploration is a family of pure ConvNet models dubbed ConvNeXt. -Constructed entirely from standard ConvNet modules, ConvNeXts compete -favorably with Transformers in terms of accuracy and scalability, -achieving 87.8% ImageNet top-1 accuracy and outperforming Swin -Transformers on COCO detection and ADE20K segmentation, while -maintaining the simplicity and efficiency of standard ConvNets. The -``torchvision.models`` subpackage -`contains `__ -several pretrained ConvNeXt model. In this tutorial we will use ConvNeXt -Tiny model. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Get a test image <#get-a-test-image>`__ -- `Get a pretrained model <#get-a-pretrained-model>`__ -- `Define a preprocessing and prepare an input - data <#define-a-preprocessing-and-prepare-an-input-data>`__ -- `Use the original model to run an - inference <#use-the-original-model-to-run-an-inference>`__ -- `Convert the model to OpenVINO Intermediate representation - format <#convert-the-model-to-openvino-intermediate-representation-format>`__ -- `Use the OpenVINO IR model to run an - inference <#use-the-openvino-ir-model-to-run-an-inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. 
code:: ipython3 - - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu torch torchvision - %pip install -q "openvino>=2023.1.0" - -Get a test image ----------------- - -First of all lets get a test -image from an open dataset. - -.. code:: ipython3 - - import requests - from pathlib import Path - - from torchvision.io import read_image - import torchvision.transforms as transforms - - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("convnext-classification.ipynb") - - img_path = Path("cats_image.jpeg") - - if not img_path.exists(): - r = requests.get("https://huggingface.co/datasets/huggingface/cats-image/resolve/main/cats_image.jpeg") - - with open(img_path, "wb") as f: - f.write(r.content) - image = read_image(img_path) - display(transforms.ToPILImage()(image)) - -Get a pretrained model ----------------------- - -Torchvision provides a -mechanism of `listing and retrieving available -models `__. - -.. code:: ipython3 - - import torchvision.models as models - - # List available models - all_models = models.list_models() - # List of models by type. Classification models are in the parent module. - classification_models = models.list_models(module=models) - - print(classification_models) - -We will use ``convnext_tiny``. To get a pretrained model just use -``models.get_model("convnext_tiny", weights='DEFAULT')`` or a specific -method of ``torchvision.models`` for this model using `default -weights `__ -that is equivalent to ``ConvNeXt_Tiny_Weights.IMAGENET1K_V1``. If you -don’t specify ``weight`` or specify ``weights=None`` it will be a random -initialization. To get all available weights for the model you can call -``weights_enum = models.get_model_weights("convnext_tiny")``, but there -is only one for this model. You can find more information how to -initialize pre-trained models -`here `__. - -.. code:: ipython3 - - model = models.convnext_tiny(weights=models.ConvNeXt_Tiny_Weights.DEFAULT) - -Define a preprocessing and prepare an input data ------------------------------------------------- - -You can use -``torchvision.transforms`` to make a preprocessing or -use\ `preprocessing transforms from the model -wight `__. - -.. code:: ipython3 - - import torch - - - preprocess = models.ConvNeXt_Tiny_Weights.DEFAULT.transforms() - - input_data = preprocess(image) - input_data = torch.stack([input_data], dim=0) - -Use the original model to run an inference ------------------------------------------- - - - -.. code:: ipython3 - - outputs = model(input_data) - -And print results - -.. 
code:: ipython3 - - # download class number to class label mapping - imagenet_classes_file_path = Path("imagenet_2012.txt") - - if not imagenet_classes_file_path.exists(): - r = requests.get( - url="https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/datasets/imagenet/imagenet_2012.txt", - ) - - with open(imagenet_classes_file_path, "w") as f: - f.write(r.text) - - imagenet_classes = open(imagenet_classes_file_path).read().splitlines() - - - def print_results(outputs: torch.Tensor): - _, predicted_class = outputs.max(1) - predicted_probability = torch.softmax(outputs, dim=1)[0, predicted_class].item() - - print(f"Predicted Class: {predicted_class.item()}") - print(f"Predicted Label: {imagenet_classes[predicted_class.item()]}") - print(f"Predicted Probability: {predicted_probability}") - -.. code:: ipython3 - - print_results(outputs) - -Convert the model to OpenVINO Intermediate representation format ----------------------------------------------------------------- - - - -OpenVINO supports PyTorch through conversion to OpenVINO Intermediate -Representation (IR) format. To take the advantage of OpenVINO -optimization tools and features, the model should be converted using the -OpenVINO Converter tool (OVC). The ``openvino.convert_model`` function -provides Python API for OVC usage. The function returns the instance of -the OpenVINO Model class, which is ready for use in the Python -interface. However, it can also be saved on disk using -``openvino.save_model`` for future execution. - -.. code:: ipython3 - - from pathlib import Path - - import openvino as ov - - - ov_model_xml_path = Path("models/ov_convnext_model.xml") - - if not ov_model_xml_path.exists(): - ov_model_xml_path.parent.mkdir(parents=True, exist_ok=True) - converted_model = ov.convert_model(model, example_input=torch.randn(1, 3, 224, 224)) - # add transform to OpenVINO preprocessing converting - ov.save_model(converted_model, ov_model_xml_path) - else: - print(f"IR model {ov_model_xml_path} already exists.") - -When the ``openvino.save_model`` function is used, an OpenVINO model is -serialized in the file system as two files with ``.xml`` and ``.bin`` -extensions. This pair of files is called OpenVINO Intermediate -Representation format (OpenVINO IR, or just IR) and useful for efficient -model deployment. OpenVINO IR can be loaded into another application for -inference using the ``openvino.Core.read_model`` function. - -Select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - -.. code:: ipython3 - - core = ov.Core() - - compiled_model = core.compile_model(ov_model_xml_path, device_name=device.value) - -Use the OpenVINO IR model to run an inference ---------------------------------------------- - - - -.. 
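code:: python
-
-    # The section above mentions openvino.Core.read_model: this is the more
-    # explicit two-step variant of the compile_model call from the previous
-    # cell, shown for illustration only (the next cell uses `compiled_model`).
-    ir_model = core.read_model(ov_model_xml_path)
-    compiled_from_ir = core.compile_model(ir_model, device_name=device.value)
-    print(compiled_from_ir.input(0).shape, compiled_from_ir.output(0).shape)
-
-..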
code:: ipython3 - - outputs = compiled_model(input_data)[0] - print_results(torch.from_numpy(outputs)) diff --git a/docs/notebooks/ct-segmentation-quantize-nncf-with-output.rst b/docs/notebooks/ct-segmentation-quantize-nncf-with-output.rst deleted file mode 100644 index 4bd3ce78f923d9..00000000000000 --- a/docs/notebooks/ct-segmentation-quantize-nncf-with-output.rst +++ /dev/null @@ -1,1023 +0,0 @@ -Quantize a Segmentation Model and Show Live Inference -===================================================== - -Kidney Segmentation with PyTorch Lightning and OpenVINO™ - Part 3 ------------------------------------------------------------------ - -This tutorial is a part of a series on how to train, optimize, quantize -and show live inference on a medical segmentation model. The goal is to -accelerate inference on a kidney segmentation model. The -`UNet `__ model is trained from -scratch; the data is from -`Kits19 `__. - -This third tutorial in the series shows how to: - -- Convert an Original model to OpenVINO IR with `model conversion - API `__ -- Quantize a PyTorch model with NNCF -- Evaluate the F1 score metric of the original model and the quantized - model -- Benchmark performance of the FP32 model and the INT8 quantized model -- Show live inference with OpenVINO’s async API - -All notebooks in this series: - -- `Data Preparation for 2D Segmentation of 3D Medical - Data `__ -- `Train a 2D-UNet Medical Imaging Model with PyTorch - Lightning `__ -- Convert and Quantize a Segmentation Model and Show Live Inference - (this notebook) -- `Live Inference and Benchmark CT-scan - data `__ - -Instructions ------------- - -This notebook needs a trained UNet model. We provide a pre-trained -model, trained for 20 epochs with the full -`Kits-19 `__ frames dataset, which -has an F1 score on the validation set of 0.9. The training code is -available in `this notebook `__. - -NNCF for PyTorch models requires a C++ compiler. On Windows, install -`Microsoft Visual Studio -2019 `__. -During installation, choose Desktop development with C++ in the -Workloads tab. On macOS, run ``xcode-select –install`` from a Terminal. -On Linux, install ``gcc``. - -Running this notebook with the full dataset will take a long time. For -demonstration purposes, this tutorial will download one converted CT -scan and use that scan for quantization and inference. For production -purposes, use a representative dataset for quantizing the model. - - -**Table of contents:** - - -- `Imports <#imports>`__ -- `Settings <#settings>`__ -- `Load PyTorch Model <#load-pytorch-model>`__ -- `Download CT-scan Data <#download-ct-scan-data>`__ -- `Configuration <#configuration>`__ - - - `Dataset <#dataset>`__ - - `Metric <#metric>`__ - -- `Quantization <#quantization>`__ -- `Compare FP32 and INT8 Model <#compare-fp32-and-int8-model>`__ - - - `Compare File Size <#compare-file-size>`__ - - `Compare Metrics for the original model and the quantized model to - be sure that there no - degradation. 
<#compare-metrics-for-the-original-model-and-the-quantized-model-to-be-sure-that-there-no-degradation->`__ - - `Compare Performance of the FP32 IR Model and Quantized - Models <#compare-performance-of-the-fp32-ir-model-and-quantized-models>`__ - - `Visually Compare Inference - Results <#visually-compare-inference-results>`__ - -- `Show Live Inference <#show-live-inference>`__ - - - `Load Model and List of Image - Files <#load-model-and-list-of-image-files>`__ - - `Show Inference <#show-inference>`__ - -- `References <#references>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - %pip install -q "openvino>=2023.3.0" "monai>=0.9.1" "torchmetrics>=0.11.0" "nncf>=2.8.0" "opencv-python" "matplotlib>=3.4" torch tqdm --extra-index-url https://download.pytorch.org/whl/cpu - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -Imports -------- - - - -.. code:: ipython3 - - import logging - import os - import time - import warnings - import zipfile - from pathlib import Path - from typing import Union - - warnings.filterwarnings("ignore", category=UserWarning) - - import cv2 - import matplotlib.pyplot as plt - import monai - import numpy as np - import torch - import nncf - import openvino as ov - from monai.transforms import LoadImage - from nncf.common.logging.logger import set_log_level - from torchmetrics import F1Score as F1 - import requests - - - set_log_level(logging.ERROR) # Disables all NNCF info and warning messages - - # Fetch `notebook_utils` module - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, device_widget - - if not Path("./custom_segmentation.py").exists(): - download_file(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/ct-segmentation-quantize/custom_segmentation.py") - from custom_segmentation import SegmentationModel - - if not Path("./async_pipeline.py").exists(): - download_file(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/ct-segmentation-quantize/async_pipeline.py") - from async_pipeline import show_live_inference - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("ct-segmentation-quantize-nncf.ipynb") - - -.. parsed-literal:: - - 2025-02-03 23:51:10.773270: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2025-02-03 23:51:10.807480: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. 
- 2025-02-03 23:51:11.401991: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -Settings --------- - - - -By default, this notebook will download one CT scan from the KITS19 -dataset that will be used for quantization. - -.. code:: ipython3 - - BASEDIR = Path("kits19_frames_1") - # Uncomment the line below to use the full dataset, as prepared in the data preparation notebook - # BASEDIR = Path("~/kits19/kits19_frames").expanduser() - MODEL_DIR = Path("model") - MODEL_DIR.mkdir(exist_ok=True) - -Load PyTorch Model ------------------- - - - -Download the pre-trained model weights, load the PyTorch model and the -``state_dict`` that was saved after training. The model used in this -notebook is a -`BasicUNet `__ -model from `MONAI `__. We provide a pre-trained -checkpoint. To see how this model performs, check out the `training -notebook `__. - -.. code:: ipython3 - - if not Path("pretrained_model/unet_kits19_state_dict.pth").exists(): - state_dict_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/kidney-segmentation-kits19/unet_kits19_state_dict.pth" - state_dict_file = download_file(state_dict_url, directory="pretrained_model") - state_dict = torch.load(state_dict_file, map_location=torch.device("cpu")) - - new_state_dict = {} - for k, v in state_dict.items(): - new_key = k.replace("_model.", "") - new_state_dict[new_key] = v - new_state_dict.pop("loss_function.pos_weight") - - model = monai.networks.nets.BasicUNet(spatial_dims=2, in_channels=1, out_channels=1).eval() - model.load_state_dict(new_state_dict) - - - -.. parsed-literal:: - - unet_kits19_state_dict.pth: 0%| | 0.00/7.58M [00:00 - - - -Download CT-scan Data ---------------------- - - - -.. code:: ipython3 - - # The CT scan case number. For example: 2 for data from the case_00002 directory - # Currently only 117 is supported - CASE = 117 - if not (BASEDIR / f"case_{CASE:05d}").exists(): - BASEDIR.mkdir(exist_ok=True) - filename = download_file(f"https://storage.openvinotoolkit.org/data/test_data/openvino_notebooks/kits19/case_{CASE:05d}.zip") - with zipfile.ZipFile(filename, "r") as zip_ref: - zip_ref.extractall(path=BASEDIR) - os.remove(filename) # remove zipfile - print(f"Downloaded and extracted data for case_{CASE:05d}") - else: - print(f"Data for case_{CASE:05d} exists") - - - -.. parsed-literal:: - - case_00117.zip: 0%| | 0.00/5.48M [00:00`__. - -Images are loaded with MONAI’s -`LoadImage `__, -to align with the image loading method in the training notebook. This -method rotates and flips the images. We define a ``rotate_and_flip`` -method to display the images in the expected orientation: - -.. code:: ipython3 - - def rotate_and_flip(image): - """Rotate `image` by 90 degrees and flip horizontally""" - return cv2.flip(cv2.rotate(image, rotateCode=cv2.ROTATE_90_CLOCKWISE), flipCode=1) - - - class KitsDataset: - def __init__(self, basedir: str): - """ - Dataset class for prepared Kits19 data, for binary segmentation (background/kidney) - Source data should exist in basedir, in subdirectories case_00000 until case_00210, - with each subdirectory containing directories imaging_frames, with jpg images, and - segmentation_frames with segmentation masks as png files. 
- See [data-preparation-ct-scan](./data-preparation-ct-scan.ipynb) - - :param basedir: Directory that contains the prepared CT scans - """ - masks = sorted(BASEDIR.glob("case_*/segmentation_frames/*png")) - - self.basedir = basedir - self.dataset = masks - print(f"Created dataset with {len(self.dataset)} items. " f"Base directory for data: {basedir}") - - def __getitem__(self, index): - """ - Get an item from the dataset at the specified index. - - :return: (image, segmentation_mask) - """ - mask_path = self.dataset[index] - image_path = str(mask_path.with_suffix(".jpg")).replace("segmentation_frames", "imaging_frames") - - # Load images with MONAI's LoadImage to match data loading in training notebook - mask = LoadImage(image_only=True, dtype=np.uint8)(str(mask_path)).numpy() - img = LoadImage(image_only=True, dtype=np.float32)(str(image_path)).numpy() - - if img.shape[:2] != (512, 512): - img = cv2.resize(img.astype(np.uint8), (512, 512)).astype(np.float32) - mask = cv2.resize(mask, (512, 512)) - - input_image = np.expand_dims(img, axis=0) - return input_image, mask - - def __len__(self): - return len(self.dataset) - -To test whether the data loader returns the expected output, we show an -image and a mask. The image and the mask are returned by the dataloader, -after resizing and preprocessing. Since this dataset contains a lot of -slices without kidneys, we select a slice that contains at least 5000 -kidney pixels to verify that the annotations look correct: - -.. code:: ipython3 - - dataset = KitsDataset(BASEDIR) - # Find a slice that contains kidney annotations - # item[0] is the annotation: (id, annotation_data) - image_data, mask = next(item for item in dataset if np.count_nonzero(item[1]) > 5000) - # Remove extra image dimension and rotate and flip the image for visualization - image = rotate_and_flip(image_data.squeeze()) - - # The data loader returns annotations as (index, mask) and mask in shape (H,W) - mask = rotate_and_flip(mask) - - fig, ax = plt.subplots(1, 2, figsize=(12, 6)) - ax[0].imshow(image, cmap="gray") - ax[1].imshow(mask, cmap="gray"); - - -.. parsed-literal:: - - Created dataset with 69 items. Base directory for data: kits19_frames_1 - - - -.. image:: ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_14_1.png - - -Metric -~~~~~~ - - - -Define a metric to determine the performance of the model. - -For this demo, we use the `F1 -score `__, or Dice coefficient, -from the -`TorchMetrics `__ -library. - -.. 
code:: ipython3 - - def compute_f1(model: Union[torch.nn.Module, ov.CompiledModel], dataset: KitsDataset): - """ - Compute binary F1 score of `model` on `dataset` - F1 score metric is provided by the torchmetrics library - `model` is expected to be a binary segmentation model, images in the - dataset are expected in (N,C,H,W) format where N==C==1 - """ - metric = F1(ignore_index=0, task="binary", average="macro") - with torch.no_grad(): - for image, target in dataset: - input_image = torch.as_tensor(image).unsqueeze(0) - if isinstance(model, ov.CompiledModel): - output_layer = model.output(0) - output = model(input_image)[output_layer] - output = torch.from_numpy(output) - else: - output = model(input_image) - label = torch.as_tensor(target.squeeze()).long() - prediction = torch.sigmoid(output.squeeze()).round().long() - metric.update(label.flatten(), prediction.flatten()) - return metric.compute() - -Quantization ------------- - - - -Before quantizing the model, we compute the F1 score on the ``FP32`` -model, for comparison: - -.. code:: ipython3 - - fp32_f1 = compute_f1(model, dataset) - print(f"FP32 F1: {fp32_f1:.3f}") - - -.. parsed-literal:: - - FP32 F1: 0.999 - - -We convert the PyTorch model to OpenVINO IR and serialize it for -comparing the performance of the ``FP32`` and ``INT8`` model later in -this notebook. - -.. code:: ipython3 - - fp32_ir_path = MODEL_DIR / Path("unet_kits19_fp32.xml") - dummy_input = torch.randn(1, 1, 512, 512) - - fp32_ir_model = ov.convert_model(model, example_input=dummy_input, input=dummy_input.shape) - ov.save_model(fp32_ir_model, str(fp32_ir_path)) - - -.. parsed-literal:: - - WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. - - -.. parsed-literal:: - - [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/monai/networks/nets/basic_unet.py:168: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if x_e.shape[-i - 1] != x_0.shape[-i - 1]: - - -`NNCF `__ provides a suite of -advanced algorithms for Neural Networks inference optimization in -OpenVINO with minimal accuracy drop. - - **Note**: NNCF Post-training Quantization is available in OpenVINO - 2023.0 release. - -Create a quantized model from the pre-trained ``FP32`` model and the -calibration dataset. The optimization process contains the following -steps: - -:: - - 1. Create a Dataset for quantization. - 2. Run `nncf.quantize` for getting an optimized model. - 3. Export the quantized model to OpenVINO IR model. - 4. Serialize the INT8 model using `ov.save_model` function for benchmarking. - -.. code:: ipython3 - - def transform_fn(data_item): - """ - Extract the model's input from the data item. - The data item here is the data item that is returned from the data source per iteration. - This function should be passed when the data item cannot be used as model's input. 
- """ - images, _ = data_item - return images - - - data_loader = torch.utils.data.DataLoader(dataset) - calibration_dataset = nncf.Dataset(data_loader, transform_fn) - quantized_model = nncf.quantize( - model, - calibration_dataset, - # Do not quantize LeakyReLU activations to allow the INT8 model to run on Intel GPU - ignored_scope=nncf.IgnoredScope(patterns=[".*LeakyReLU.*"]), - ) - - - -.. parsed-literal:: - - Output() - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - -Convert quantized model to OpenVINO IR model and save it. - -.. code:: ipython3 - - dummy_input = torch.randn(1, 1, 512, 512) - int8_ir_path = MODEL_DIR / "unet_kits19_int8.xml" - int8_ir_model = ov.convert_model(quantized_model, example_input=dummy_input, input=dummy_input.shape) - ov.save_model(int8_ir_model, str(int8_ir_path)) - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/quantization/layers.py:340: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - return self._level_low.item() - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/quantization/layers.py:348: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - return self._level_high.item() - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/monai/networks/nets/basic_unet.py:168: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if x_e.shape[-i - 1] != x_0.shape[-i - 1]: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: - Tensor-likes are not close! - - Mismatched elements: 244841 / 262144 (93.4%) - Greatest absolute difference: 4.007030487060547 at index (0, 0, 464, 233) (up to 1e-05 allowed) - Greatest relative difference: 8038.314330448468 at index (0, 0, 354, 57) (up to 1e-05 allowed) - _check_trace( - - -This notebook demonstrates post-training quantization with NNCF. - -NNCF also supports quantization-aware training, and other algorithms -than quantization. See the `NNCF -documentation `__ in the NNCF -repository for more information. - -Compare FP32 and INT8 Model ---------------------------- - - - -Compare File Size -~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - fp32_ir_model_size = fp32_ir_path.with_suffix(".bin").stat().st_size / 1024 - quantized_model_size = int8_ir_path.with_suffix(".bin").stat().st_size / 1024 - - print(f"FP32 IR model size: {fp32_ir_model_size:.2f} KB") - print(f"INT8 model size: {quantized_model_size:.2f} KB") - - -.. parsed-literal:: - - FP32 IR model size: 3864.14 KB - INT8 model size: 1953.48 KB - - -Select Inference Device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - core = ov.Core() - # By default, benchmark on MULTI:CPU,GPU if a GPU is available, otherwise on CPU. - device_list = ["MULTI:CPU,GPU" if "GPU" in core.available_devices else "AUTO"] - - device = device_widget(device_list[0], added=device_list) - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Compare Metrics for the original model and the quantized model to be sure that there no degradation. -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - int8_compiled_model = core.compile_model(int8_ir_model, device.value) - int8_f1 = compute_f1(int8_compiled_model, dataset) - - print(f"FP32 F1: {fp32_f1:.3f}") - print(f"INT8 F1: {int8_f1:.3f}") - - -.. parsed-literal:: - - FP32 F1: 0.999 - INT8 F1: 0.999 - - -Compare Performance of the FP32 IR Model and Quantized Models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To measure the inference performance of the ``FP32`` and ``INT8`` -models, we use `Benchmark -Tool `__ -- OpenVINO’s inference performance measurement tool. Benchmark tool is a -command line application, part of OpenVINO development tools, that can -be run in the notebook with ``! benchmark_app`` or -``%sx benchmark_app``. - - **NOTE**: For the most accurate performance estimation, it is - recommended to run ``benchmark_app`` in a terminal/command prompt - after closing other applications. Run - ``benchmark_app -m model.xml -d CPU`` to benchmark async inference on - CPU for one minute. Change ``CPU`` to ``GPU`` to benchmark on GPU. - Run ``benchmark_app --help`` to see all command line options. - -.. code:: ipython3 - - # ! benchmark_app --help - -.. code:: ipython3 - - # Benchmark FP32 model - ! benchmark_app -m $fp32_ir_path -d $device.value -t 15 -api sync - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.LATENCY. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 9.41 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,1,512,512] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.final_conv/aten::_convolution/Add) : f32 / [...] 
/ [1,1,512,512] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [N,C,H,W] / [1,1,512,512] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.final_conv/aten::_convolution/Add) : f32 / [...] / [1,1,512,512] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 231.73 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.LATENCY - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: False - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 12 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 1 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 - [ INFO ] PERFORMANCE_HINT: LATENCY - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference synchronously, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 56.93 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 422 iterations - [ INFO ] Duration: 15024.97 ms - [ INFO ] Latency: - [ INFO ] Median: 34.51 ms - [ INFO ] Average: 35.38 ms - [ INFO ] Min: 34.12 ms - [ INFO ] Max: 47.52 ms - [ INFO ] Throughput: 28.09 FPS - - -.. code:: ipython3 - - # Benchmark INT8 model - ! benchmark_app -m $int8_ir_path -d $device.value -t 15 -api sync - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.LATENCY. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 10.83 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,1,512,512] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.final_conv/aten::_convolution/Add) : f32 / [...] 
/ [1,1,512,512] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [N,C,H,W] / [1,1,512,512] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.final_conv/aten::_convolution/Add) : f32 / [...] / [1,1,512,512] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 258.69 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model49 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.LATENCY - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: False - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 12 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model49 - [ INFO ] NUM_STREAMS: 1 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 - [ INFO ] PERFORMANCE_HINT: LATENCY - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference synchronously, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 29.35 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 958 iterations - [ INFO ] Duration: 15000.71 ms - [ INFO ] Latency: - [ INFO ] Median: 15.41 ms - [ INFO ] Average: 15.46 ms - [ INFO ] Min: 15.10 ms - [ INFO ] Max: 17.20 ms - [ INFO ] Throughput: 63.86 FPS - - -Visually Compare Inference Results -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Visualize the results of the model on four slices of the validation set. -Compare the results of the ``FP32`` IR model with the results of the -quantized ``INT8`` model and the reference segmentation annotation. - -Medical imaging datasets tend to be very imbalanced: most of the slices -in a CT scan do not contain kidney data. The segmentation model should -be good at finding kidneys where they exist (in medical terms: have good -sensitivity) but also not find spurious kidneys that do not exist (have -good specificity). In the next cell, there are four slices: two slices -that have no kidney data, and two slices that contain kidney data. For -this example, a slice has kidney data if at least 50 pixels in the -slices are annotated as kidney. - -Run this cell again to show results on a different subset. The random -seed is displayed to enable reproducing specific runs of this cell. - - **NOTE**: the images are shown after optional augmenting and - resizing. 
In the Kits19 dataset all but one of the cases has the
   ``(512, 512)`` input shape.

.. code:: ipython3

    # The sigmoid function is used to transform the result of the network
    # to binary segmentation masks
    def sigmoid(x):
        return np.exp(-np.logaddexp(0, -x))


    num_images = 4
    colormap = "gray"

    # Load FP32 and INT8 models
    core = ov.Core()
    fp_model = core.read_model(fp32_ir_path)
    int8_model = core.read_model(int8_ir_path)
    compiled_model_fp = core.compile_model(fp_model, device_name=device.value)
    compiled_model_int8 = core.compile_model(int8_model, device_name=device.value)
    output_layer_fp = compiled_model_fp.output(0)
    output_layer_int8 = compiled_model_int8.output(0)

    # Set seed to current time. To reproduce specific results, copy the printed seed
    # and manually set `seed` to that value. The seed is set before sampling so that
    # the printed value actually reproduces the selected slices.
    seed = int(time.time())
    np.random.seed(seed)
    print(f"Visualizing results with seed {seed}")

    # Create subset of dataset
    background_slices = (item for item in dataset if np.count_nonzero(item[1]) == 0)
    kidney_slices = (item for item in dataset if np.count_nonzero(item[1]) > 50)

    background_slices_l = list(background_slices)
    kidney_slices_l = list(kidney_slices)
    if len(background_slices_l) != 0:
        background_id = np.random.choice(len(background_slices_l), 2)
        kidney_id = np.random.choice(len(kidney_slices_l), 2)
        data_subset = [background_slices_l[idx] for idx in background_id] + [kidney_slices_l[idx] for idx in kidney_id]
    else:
        kidney_id = np.random.choice(len(kidney_slices_l), 2)
        data_subset = [kidney_slices_l[idx] for idx in kidney_id]

    fig, ax = plt.subplots(nrows=num_images, ncols=4, figsize=(24, num_images * 4))
    for i, (image, mask) in enumerate(data_subset):
        display_image = rotate_and_flip(image.squeeze())
        target_mask = rotate_and_flip(mask).astype(np.uint8)
        # Add batch dimension to image and do inference on FP and INT8 models
        input_image = np.expand_dims(image, 0)
        res_fp = compiled_model_fp([input_image])
        res_int8 = compiled_model_int8([input_image])

        # Process inference outputs and convert to binary segmentation masks
        result_mask_fp = sigmoid(res_fp[output_layer_fp]).squeeze().round().astype(np.uint8)
        result_mask_int8 = sigmoid(res_int8[output_layer_int8]).squeeze().round().astype(np.uint8)
        result_mask_fp = rotate_and_flip(result_mask_fp)
        result_mask_int8 = rotate_and_flip(result_mask_int8)

        # Display images, annotations, FP32 result and INT8 result
        ax[i, 0].imshow(display_image, cmap=colormap)
        ax[i, 1].imshow(target_mask, cmap=colormap)
        ax[i, 2].imshow(result_mask_fp, cmap=colormap)
        ax[i, 3].imshow(result_mask_int8, cmap=colormap)
        ax[i, 2].set_title("Prediction on FP32 model")
        ax[i, 3].set_title("Prediction on INT8 model")


.. parsed-literal::

    Visualizing results with seed 1738623154



.. image:: ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_37_1.png


Show Live Inference
-------------------



To show live inference on the model in the notebook, we will use the
asynchronous processing feature of OpenVINO.

We use the ``show_live_inference`` function from `Notebook
Utils `__ to show live inference. This
function uses `Open Model
Zoo `__\ ’s Async
Pipeline and Model API to perform asynchronous inference. After
inference on the specified CT scan has completed, the total time and
throughput (fps), including preprocessing and displaying, will be
printed.
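As a reference, the cell below is a minimal, simplified sketch of the same asynchronous
idea using OpenVINO’s ``AsyncInferQueue`` directly, rather than the ``show_live_inference``
helper. It reuses ``int8_ir_path``, ``device`` and ``dataset`` from the cells above; the
number of parallel jobs and the callback logic are illustrative choices only and do not
reflect the helper’s actual implementation.

.. code:: ipython3

    import time

    import numpy as np
    import openvino as ov

    core = ov.Core()
    compiled_int8 = core.compile_model(int8_ir_path, device.value)

    # A queue of infer requests: while one request runs, the next input can be submitted.
    infer_queue = ov.AsyncInferQueue(compiled_int8, jobs=2)
    results = {}


    def completion_callback(request, frame_index):
        # Called when a request finishes; keep a copy of the raw network output.
        results[frame_index] = request.get_output_tensor(0).data.copy()


    infer_queue.set_callback(completion_callback)

    start_time = time.perf_counter()
    for i, (image, _) in enumerate(dataset):
        # `dataset` yields images shaped (1, H, W); add a batch dimension for the model.
        infer_queue.start_async(np.expand_dims(image, 0), userdata=i)
    infer_queue.wait_all()
    duration = time.perf_counter() - start_time

    print(f"Processed {len(results)} frames in {duration:.2f} s ({len(results) / duration:.1f} FPS)")

In contrast to the synchronous loop used earlier for the F1 computation, requests are
submitted here without waiting for the previous one to complete, which is what lets the
live demo overlap pre-processing, inference and display.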
- - **NOTE**: If you experience flickering on Firefox, consider using - Chrome or Edge to run this notebook. - -Load Model and List of Image Files -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We load the segmentation model to OpenVINO Runtime with -``SegmentationModel``, based on the `Open Model -Zoo `__ Model API. -This model implementation includes pre and post processing for the -model. For ``SegmentationModel``, this includes the code to create an -overlay of the segmentation mask on the original image/frame. - -.. code:: ipython3 - - CASE = 117 - - segmentation_model = SegmentationModel(ie=core, model_path=int8_ir_path, sigmoid=True, rotate_and_flip=True) - case_path = BASEDIR / f"case_{CASE:05d}" - image_paths = sorted(case_path.glob("imaging_frames/*jpg")) - print(f"{case_path.name}, {len(image_paths)} images") - - -.. parsed-literal:: - - case_00117, 69 images - - -Show Inference -~~~~~~~~~~~~~~ - - - -In the next cell, we run the ``show_live_inference`` function, which -loads the ``segmentation_model`` to the specified ``device`` (using -caching for faster model loading on GPU devices), loads the images, -performs inference, and displays the results on the frames loaded in -``images`` in real-time. - -.. code:: ipython3 - - reader = LoadImage(image_only=True, dtype=np.uint8) - show_live_inference( - ie=core, - image_paths=image_paths, - model=segmentation_model, - device=device.value, - reader=reader, - ) - - - -.. image:: ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_42_0.jpg - - -.. parsed-literal:: - - Loaded model to AUTO in 0.15 seconds. - Total time for 68 frames: 2.44 seconds, fps:28.29 - - -References ----------- - - - -**OpenVINO** - `NNCF -Repository `__ - `Neural -Network Compression Framework for fast model -inference `__ - `OpenVINO API -Tutorial `__ - `OpenVINO PyPI (pip -install openvino) `__ - -**Kits19 Data** - `Kits19 Challenge -Homepage `__ - `Kits19 GitHub -Repository `__ - `The KiTS19 -Challenge Data: 300 Kidney Tumor Cases with Clinical Context, CT -Semantic Segmentations, and Surgical -Outcomes `__ - `The state of the art -in kidney and kidney tumor segmentation in contrast-enhanced CT imaging: -Results of the KiTS19 -challenge `__ diff --git a/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_14_1.png b/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_14_1.png deleted file mode 100644 index d018cc7ac24163..00000000000000 --- a/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_14_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f8a243947cb2e6e61c56f5971421b38d61ef231f122442e7e17517f811ffb643 -size 158997 diff --git a/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_37_1.png b/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_37_1.png deleted file mode 100644 index c75d33eb532747..00000000000000 --- a/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_37_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ba4e183e1d22c63fc7a4fe53f11aa33391f0fe1747db083e712504a8e96b02cd -size 384665 diff --git a/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_42_0.jpg 
b/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_42_0.jpg deleted file mode 100644 index a7a3dbab21c740..00000000000000 --- a/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_42_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7a32b8f52591f926f1cc9afda5f199e189afed948f93c4c01a486cc43f7bdfd9 -size 73812 diff --git a/docs/notebooks/ddcolor-image-colorization-with-output.rst b/docs/notebooks/ddcolor-image-colorization-with-output.rst deleted file mode 100644 index 105a890e01040e..00000000000000 --- a/docs/notebooks/ddcolor-image-colorization-with-output.rst +++ /dev/null @@ -1,734 +0,0 @@ -Colorize grayscale images using DDColor and OpenVINO -====================================================== - -Image colorization is the process of adding color to grayscale images. -Initially captured in black and white, these images are transformed into -vibrant, lifelike representations by estimating RGB colors. This -technology enhances both aesthetic appeal and perceptual quality. -Historically, artists manually applied colors to monochromatic -photographs, a painstaking task that could take up to a month for a -single image. However, with advancements in information technology and -the rise of deep neural networks, automated image colorization has -become increasingly important. - -DDColor is one of the most progressive methods of image colorization in -our days. It is a novel approach using dual decoders: a pixel decoder -and a query-based color decoder, that stands out in its ability to -produce photo-realistic colorization, particularly in complex scenes -with multiple objects and diverse contexts. |image0| - -More details about this approach can be found in original model -`repository `__ and -`paper `__. - -In this tutorial we consider how to convert and run DDColor using -OpenVINO. Additionally, we will demonstrate how to optimize this model -using `NNCF `__. - -🪄 Let’s start to explore magic of image colorization! - -**Table of contents:** - -- `Prerequisites <#prerequisites>`__ -- `Load PyTorch model <#load-pytorch-model>`__ -- `Run PyTorch model inference <#run-pytorch-model-inference>`__ -- `Convert PyTorch model to OpenVINO Intermediate - Representation <#convert-pytorch-model-to-openvino-intermediate-representation>`__ -- `Run OpenVINO model inference <#run-openvino-model-inference>`__ -- `Optimize OpenVINO model using - NNCF <#optimize-openvino-model-using-nncf>`__ - - - `Collect quantization dataset <#collect-quantization-dataset>`__ - - `Perform model quantization <#perform-model-quantization>`__ - -- `Run INT8 model inference <#run-int8-model-inference>`__ -- `Compare FP16 and INT8 model - size <#compare-fp16-and-int8-model-size>`__ -- `Compare inference time of the FP16 and INT8 - models <#compare-inference-time-of-the-fp16-and-int8-models>`__ -- `Interactive inference <#interactive-inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. |image0| image:: https://github.com/piddnad/DDColor/raw/master/assets/network_arch.jpg - -Prerequisites -------------- - - - -.. 
code:: ipython3 - - import platform - - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - - - %pip install -q "nncf>=2.11.0" "torch>=2.1" "torchvision" "timm" "opencv_python" "pillow" "PyYAML" "scipy" "scikit-image" "datasets" "gradio>=4.19" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -Uq "openvino>=2024.3.0" - - -.. parsed-literal:: - - ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. - tensorflow 2.13.1 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.12.2 which is incompatible. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - -.. code:: ipython3 - - from pathlib import Path - import requests - - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("ddcolor-image-colorization.ipynb") - -.. code:: ipython3 - - from cmd_helper import clone_repo - - - clone_repo("https://github.com/piddnad/DDColor.git") - - - - -.. parsed-literal:: - - PosixPath('DDColor') - - - -.. code:: ipython3 - - if Path("DDColor/inference").exists(): - try: - from inference.colorization_pipeline_hf import DDColorHF, ImageColorizationPipelineHF - except Exception: - from inference.colorization_pipeline_hf import DDColorHF, ImageColorizationPipelineHF - else: - try: - from infer_hf import DDColorHF, ImageColorizationPipelineHF - except Exception: - from infer_hf import DDColorHF, ImageColorizationPipelineHF - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers - warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning) - - -Load PyTorch model ------------------- - - - -There are several models from DDColor’s family provided in `model -repository `__. -We will use DDColor-T, the most lightweight version of ddcolor model, -but demonstrated in the tutorial steps are also applicable to other -models from DDColor family. - -.. code:: ipython3 - - import torch - - model_name = "ddcolor_paper_tiny" - - ddcolor_model = DDColorHF.from_pretrained(f"piddnad/{model_name}") - - - colorizer = ImageColorizationPipelineHF(model=ddcolor_model, input_size=512) - - ddcolor_model.to("cpu") - colorizer.device = torch.device("cpu") - -Run PyTorch model inference ---------------------------- - - - -.. code:: ipython3 - - import cv2 - import PIL - - IMG_PATH = "DDColor/assets/test_images/Ansel Adams _ Moore Photography.jpeg" - - - img = cv2.imread(IMG_PATH) - - PIL.Image.fromarray(img[:, :, ::-1]) - - - - -.. 
image:: ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_9_0.png - - - -.. code:: ipython3 - - image_out = colorizer.process(img) - PIL.Image.fromarray(image_out[:, :, ::-1]) - - - - -.. image:: ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_10_0.png - - - -Convert PyTorch model to OpenVINO Intermediate Representation -------------------------------------------------------------- - - - -OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate -Representation (IR). OpenVINO model conversion API should be used for -these purposes. ``ov.convert_model`` function accepts original PyTorch -model instance and example input for tracing and returns ``ov.Model`` -representing this model in OpenVINO framework. Converted model can be -used for saving on disk using ``ov.save_model`` function or directly -loading on device using ``core.complie_model``. - -.. code:: ipython3 - - import openvino as ov - import torch - - OV_COLORIZER_PATH = Path("ddcolor.xml") - - if not OV_COLORIZER_PATH.exists(): - ov_model = ov.convert_model(ddcolor_model, example_input=torch.ones((1, 3, 512, 512)), input=[1, 3, 512, 512]) - ov.save_model(ov_model, OV_COLORIZER_PATH) - -Run OpenVINO model inference ----------------------------- - - - -Select one of supported devices for inference using dropdown list. - -.. code:: ipython3 - - from notebook_utils import device_widget - - core = ov.Core() - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - compiled_model = core.compile_model(OV_COLORIZER_PATH, device.value) - -.. code:: ipython3 - - import cv2 - import numpy as np - import torch - import torch.nn.functional as F - - - def process(img, compiled_model): - # Preprocess input image - height, width = img.shape[:2] - - # Normalize to [0, 1] range - img = (img / 255.0).astype(np.float32) - orig_l = cv2.cvtColor(img, cv2.COLOR_BGR2Lab)[:, :, :1] # (h, w, 1) - - # Resize rgb image -> lab -> get grey -> rgb - img = cv2.resize(img, (512, 512)) - img_l = cv2.cvtColor(img, cv2.COLOR_BGR2Lab)[:, :, :1] - img_gray_lab = np.concatenate((img_l, np.zeros_like(img_l), np.zeros_like(img_l)), axis=-1) - img_gray_rgb = cv2.cvtColor(img_gray_lab, cv2.COLOR_LAB2RGB) - - # Transpose HWC -> CHW and add batch dimension - tensor_gray_rgb = torch.from_numpy(img_gray_rgb.transpose((2, 0, 1))).float().unsqueeze(0) - - # Run model inference - output_ab = compiled_model(tensor_gray_rgb)[0] - - # Postprocess result - # resize ab -> concat original l -> rgb - output_ab_resize = F.interpolate(torch.from_numpy(output_ab), size=(height, width))[0].float().numpy().transpose(1, 2, 0) - output_lab = np.concatenate((orig_l, output_ab_resize), axis=-1) - output_bgr = cv2.cvtColor(output_lab, cv2.COLOR_LAB2BGR) - - output_img = (output_bgr * 255.0).round().astype(np.uint8) - - return output_img - -.. code:: ipython3 - - ov_processed_img = process(img, compiled_model) - PIL.Image.fromarray(ov_processed_img[:, :, ::-1]) - - - - -.. image:: ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_17_0.png - - - -Optimize OpenVINO model using NNCF ----------------------------------- - - - -`NNCF `__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. 
Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -The optimization process contains the following steps: - -1. Create a calibration dataset for quantization. -2. Run ``nncf.quantize()`` to obtain quantized model. -3. Save the ``INT8`` model using ``openvino.save_model()`` function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget() - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - import requests - - OV_INT8_COLORIZER_PATH = Path("ddcolor_int8.xml") - compiled_int8_model = None - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -Collect quantization dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use a portion of -`ummagumm-a/colorization_dataset `__ -dataset from Hugging Face as calibration data. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from datasets import load_dataset - - subset_size = 300 - calibration_data = [] - - if not OV_INT8_COLORIZER_PATH.exists(): - dataset = load_dataset("ummagumm-a/colorization_dataset", split="train", streaming=True).shuffle(seed=42).take(subset_size) - for idx, batch in enumerate(dataset): - if idx >= subset_size: - break - img = np.array(batch["conditioning_image"]) - img = (img / 255.0).astype(np.float32) - img = cv2.resize(img, (512, 512)) - img_l = cv2.cvtColor(np.stack([img, img, img], axis=2), cv2.COLOR_BGR2Lab)[:, :, :1] - img_gray_lab = np.concatenate((img_l, np.zeros_like(img_l), np.zeros_like(img_l)), axis=-1) - img_gray_rgb = cv2.cvtColor(img_gray_lab, cv2.COLOR_LAB2RGB) - - image = np.expand_dims(img_gray_rgb.transpose((2, 0, 1)).astype(np.float32), axis=0) - calibration_data.append(image) - -Perform model quantization -~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - - if not OV_INT8_COLORIZER_PATH.exists(): - ov_model = core.read_model(OV_COLORIZER_PATH) - quantized_model = nncf.quantize( - model=ov_model, - subset_size=subset_size, - calibration_dataset=nncf.Dataset(calibration_data), - ) - ov.save_model(quantized_model, OV_INT8_COLORIZER_PATH) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -.. parsed-literal:: - - 2025-02-03 23:55:05.499380: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2025-02-03 23:55:05.538709: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2025-02-03 23:55:05.951653: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - - -.. parsed-literal:: - - Output() - - - - - - - - - - -.. 
parsed-literal:: - - Output() - - - - - - - - - -Run INT8 model inference ------------------------- - - - -.. code:: ipython3 - - from IPython.display import display - - if OV_INT8_COLORIZER_PATH.exists(): - compiled_int8_model = core.compile_model(OV_INT8_COLORIZER_PATH, device.value) - img = cv2.imread("DDColor/assets/test_images/Ansel Adams _ Moore Photography.jpeg") - img_out = process(img, compiled_int8_model) - display(PIL.Image.fromarray(img_out[:, :, ::-1])) - - - -.. image:: ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_26_0.png - - -Compare FP16 and INT8 model size --------------------------------- - - - -.. code:: ipython3 - - fp16_ir_model_size = OV_COLORIZER_PATH.with_suffix(".bin").stat().st_size / 2**20 - - print(f"FP16 model size: {fp16_ir_model_size:.2f} MB") - - if OV_INT8_COLORIZER_PATH.exists(): - quantized_model_size = OV_INT8_COLORIZER_PATH.with_suffix(".bin").stat().st_size / 2**20 - print(f"INT8 model size: {quantized_model_size:.2f} MB") - print(f"Model compression rate: {fp16_ir_model_size / quantized_model_size:.3f}") - - -.. parsed-literal:: - - FP16 model size: 104.89 MB - INT8 model size: 52.97 MB - Model compression rate: 1.980 - - -Compare inference time of the FP16 and INT8 models --------------------------------------------------- - - - -To measure the inference performance of OpenVINO FP16 and INT8 models, -use `Benchmark -Tool `__. - - **NOTE**: For the most accurate performance estimation, it is - recommended to run ``benchmark_app`` in a terminal/command prompt - after closing other applications. - -.. code:: ipython3 - - !benchmark_app -m $OV_COLORIZER_PATH -d $device.value -api async -shape "[1,3,512,512]" -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 42.87 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,512,512] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.refine_net.0.0/aten::_convolution/Add) : f32 / [...] / [1,2,512,512] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,512,512] - [ INFO ] Reshape model took 0.04 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,512,512] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.refine_net.0.0/aten::_convolution/Add) : f32 / [...] 
/ [1,2,512,512] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 1292.95 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 6 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 24 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 6 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 6 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 6 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 546.86 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 72 iterations - [ INFO ] Duration: 16123.94 ms - [ INFO ] Latency: - [ INFO ] Median: 1337.94 ms - [ INFO ] Average: 1322.61 ms - [ INFO ] Min: 901.39 ms - [ INFO ] Max: 1725.13 ms - [ INFO ] Throughput: 4.47 FPS - - -.. code:: ipython3 - - if OV_INT8_COLORIZER_PATH.exists(): - !benchmark_app -m $OV_INT8_COLORIZER_PATH -d $device.value -api async -shape "[1,3,512,512]" -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 69.28 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,512,512] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.refine_net.0.0/aten::_convolution/Add) : f32 / [...] 
/ [1,2,512,512] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,512,512] - [ INFO ] Reshape model took 0.04 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,512,512] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.refine_net.0.0/aten::_convolution/Add) : f32 / [...] / [1,2,512,512] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 2213.17 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 6 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 24 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 6 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 6 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 6 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 275.90 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 156 iterations - [ INFO ] Duration: 15504.26 ms - [ INFO ] Latency: - [ INFO ] Median: 592.20 ms - [ INFO ] Average: 590.64 ms - [ INFO ] Min: 357.21 ms - [ INFO ] Max: 685.31 ms - [ INFO ] Throughput: 10.06 FPS - - -Interactive inference ---------------------- - - - -.. 
code:: ipython3 - - def generate(image, use_int8=True): - image_in = cv2.imread(image) - image_out = process(image_in, compiled_model if not use_int8 else compiled_int8_model) - image_out_pil = PIL.Image.fromarray(cv2.cvtColor(image_out, cv2.COLOR_BGR2RGB)) - return image_out_pil - - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/ddcolor-image-colorization/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=generate, quantized=compiled_int8_model is not None) - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - - -.. parsed-literal:: - - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. - - - - - - - diff --git a/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_10_0.jpg b/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_10_0.jpg deleted file mode 100644 index 77e9e43ffd6303..00000000000000 --- a/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_10_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4a92ff02af12364a2b15169f96657de93c6a172e20174f7b8a857578d58405f9 -size 164917 diff --git a/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_10_0.png b/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_10_0.png deleted file mode 100644 index 5e84b839762374..00000000000000 --- a/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_10_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:82894174823bc19a70eb250cf7238d542f2d81127e6bd3760d210c94796f2968 -size 1497540 diff --git a/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_17_0.jpg b/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_17_0.jpg deleted file mode 100644 index 4941cb1f61198d..00000000000000 --- a/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_17_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e152195f834e9ac26e7df50e12c80cf13ee8d747e62dd29edf85b6fabb98cf84 -size 164898 diff --git a/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_17_0.png b/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_17_0.png deleted file mode 100644 index df05154ccae067..00000000000000 --- a/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_17_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f5d2b934619e33836937b6f9fe32987a37ca7506ecaf7d044d8286b6212bebe5 -size 1497558 diff --git a/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_26_0.jpg 
b/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_26_0.jpg deleted file mode 100644 index 04bff7b94f0298..00000000000000 --- a/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_26_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88965d29c3f9adf29bd2de861f9dedcfd7c44ddf8674b38107f488667aa73ea5 -size 167158 diff --git a/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_26_0.png b/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_26_0.png deleted file mode 100644 index f825a9fa52d819..00000000000000 --- a/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_26_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d22ee20fb5fef1b613af92d46e4af263846db9d9ced5db7c0d0213dcafde2b65 -size 1522093 diff --git a/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_9_0.jpg b/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_9_0.jpg deleted file mode 100644 index 8fb53b4e8d0997..00000000000000 --- a/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_9_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:614b930200ec51d4631b7c2a3cc743e18a8f3f94c3ea47673ef1fca37028af31 -size 152058 diff --git a/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_9_0.png b/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_9_0.png deleted file mode 100644 index d28e544903e803..00000000000000 --- a/docs/notebooks/ddcolor-image-colorization-with-output_files/ddcolor-image-colorization-with-output_9_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0c105b02bbf0b3c82c253b6b4e92d7b28518775279197cbf239ca40dd52c9e9e -size 792834 diff --git a/docs/notebooks/deepseek-r1-with-output.rst b/docs/notebooks/deepseek-r1-with-output.rst deleted file mode 100644 index 47176392fcc5ca..00000000000000 --- a/docs/notebooks/deepseek-r1-with-output.rst +++ /dev/null @@ -1,500 +0,0 @@ -LLM reasoning with DeepSeek-R1 distilled models -=============================================== - -`DeepSeek-R1 `__ -is an open-source reasoning model developed by DeepSeek to address tasks -requiring logical inference, mathematical problem-solving, and real-time -decision-making. With DeepSeek-R1, you can follow its logic, making it -easier to understand and, if necessary, challenge its output. This -capability gives reasoning models an edge in fields where outcomes need -to be explainable, like research or complex decision-making. - -Distillation in AI creates smaller, more efficient models from larger -ones, preserving much of their reasoning power while reducing -computational demands. DeepSeek applied this technique to create a suite -of distilled models from R1, using Qwen and Llama architectures. That -allows us to try DeepSeek-R1 capability locally on usual laptops. - -In this tutorial, we consider how to run DeepSeek-R1 distilled models -using OpenVINO. 
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Select model for inference <#select-model-for-inference>`__ -- `Convert model using Optimum-CLI - tool <#convert-model-using-optimum-cli-tool>`__ - - - `Weights Compression using - Optimum-CLI <#weights-compression-using-optimum-cli>`__ - -- `Instantiate pipeline with OpenVINO Generate - API <#instantiate-pipeline-with-openvino-generate-api>`__ -- `Run Chatbot <#run-chatbot>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install required dependencies - -.. code:: ipython3 - - import os - import platform - - os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - - %pip install -q -U "openvino>=2024.6.0" openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu\ - "git+https://github.com/huggingface/optimum-intel.git"\ - "nncf==2.14.1"\ - "torch>=2.1"\ - "datasets" \ - "accelerate" \ - "gradio>=4.19" \ - "transformers>=4.43.1" \ - "huggingface-hub>=0.26.5" \ - "einops" "tiktoken" - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("llm_config.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("llm_config.py", "w").write(r.text) - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("deepseek-r1.ipynb") - -Select model for inference --------------------------- - - - -The tutorial supports different models, you can select one from the -provided options to compare the quality of LLM solutions: - -- **DeepSeek-R1-Distill-Llama-8B** is a distilled model based on - `Llama-3.1-8B `__, - that prioritizes high performance and advanced reasoning - capabilities, particularly excelling in tasks requiring mathematical - and factual precision. Check `model - card `__ - for more info. -- **DeepSeek-R1-Distill-Qwen-1.5B** is the smallest DeepSeek-R1 - distilled model based on - `Qwen2.5-Math-1.5B `__. - Despite its compact size, the model demonstrates strong capabilities - in solving basic mathematical tasks, at the same time its programming - capabilities are limited. Check `model - card `__ - for more info. -- **DeepSeek-R1-Distill-Qwen-7B** is a distilled model based on - `Qwen-2.5-Math-7B `__. - The model demonstrates a good balance between mathematical and - factual reasoning and can be less suited for complex coding tasks. - Check `model - card `__ - for more info. -- **DeepSeek-R1-Distil-Qwen-14B** is a distilled model based on - `Qwen2.5-14B `__ that has - great competence in factual reasoning and solving complex - mathematical tasks. Check `model - card `__ - for more info. 
-
-`Weight
-compression `__
-is a technique for enhancing the efficiency of models, especially those
-with large memory requirements. This method reduces the model’s memory
-footprint, a crucial factor for Large Language Models (LLMs). We provide
-several options for model weight compression:
-
-- **FP16** reduces the model binary size on disk by saving the model with
-  ``save_model`` and weights compressed to FP16 precision. This approach is
-  available in OpenVINO out of the box and is the default behavior.
-- **INT8** is an 8-bit weight-only quantization provided by
-  `NNCF `__. This method
-  compresses weights to an 8-bit integer data type, which balances
-  model size reduction and accuracy, making it a versatile option for a
-  broad range of applications.
-- **INT4** is a 4-bit weight-only quantization provided by
-  `NNCF `__. It quantizes weights to an
-  unsigned 4-bit integer either symmetrically, around a fixed zero point of
-  eight (i.e., the midpoint between zero and 15), in case of
-  **symmetric quantization**, or asymmetrically, with a non-fixed zero point,
-  in case of **asymmetric quantization**. Compared to INT8 compression,
-  INT4 compression improves performance even more, but introduces a minor
-  drop in prediction quality. INT4 is ideal for situations where speed is
-  prioritized over accuracy within an acceptable trade-off.
-- **INT4 AWQ** is a 4-bit activation-aware weight quantization.
-  `Activation-aware Weight
-  Quantization `__ (AWQ) is an
-  algorithm that tunes model weights for more accurate INT4
-  compression. It slightly improves generation quality of compressed
-  LLMs, but requires significant additional time for tuning weights on
-  a calibration dataset. We will use the ``wikitext-2-raw-v1/train`` subset
-  of the
-  `Wikitext `__
-  dataset for calibration.
-- **INT4 NPU-friendly** is a 4-bit channel-wise quantization. This
-  approach is
-  `recommended `__
-  for LLM inference using NPU.
-
-.. code:: ipython3
-
-    from notebook_utils import device_widget
-
-    device = device_widget(default="CPU")
-
-    device
-
-
-
-
-.. parsed-literal::
-
-    Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU')
-
-
-
-.. code:: ipython3
-
-    from llm_config import get_llm_selection_widget
-
-    form, lang, model_id_widget, compression_variant, _ = get_llm_selection_widget(device=device.value)
-
-    form
-
-
-
-
-.. parsed-literal::
-
-    Box(children=(Box(children=(Label(value='Language:'), Dropdown(options=('English', 'Chinese'), value='English'…
-
-
-
-.. code:: ipython3
-
-    model_configuration = model_id_widget.value
-    model_id = model_id_widget.label
-    print(f"Selected model {model_id} with {compression_variant.value} compression")
-
-
-.. parsed-literal::
-
-    Selected model DeepSeek-R1-Distill-Llama-8B with INT4 compression
-
-
-Convert model using Optimum-CLI tool
-------------------------------------
-
-
-
-`Optimum Intel `__
-is the interface between the
-`Transformers `__ and
-`Diffusers `__ libraries
-and OpenVINO to accelerate end-to-end pipelines on Intel architectures.
-It provides an easy-to-use CLI interface for exporting models to `OpenVINO
-Intermediate Representation
-(IR) `__
-format.
-
-.. raw:: html
-
-
-.. raw:: html
-
-
-
-Click here to read more about Optimum CLI usage
-
-.. raw:: html
-
-
-
-The command below demonstrates the basic usage of ``optimum-cli`` for
-model export:
-
-::
-
-   optimum-cli export openvino --model --task
-
-where the ``--model`` argument is a model id from the HuggingFace Hub or a
-local directory with the model (saved using the ``.save_pretrained`` method),
-and ``--task`` is one of the `supported
-tasks `__
-that the exported model should solve. For LLMs, it is recommended to use
-``text-generation-with-past``. If model initialization requires using
-remote code, the ``--trust-remote-code`` flag should additionally be passed.
-
-.. raw:: html
-
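-
-For example, a command along the following lines would export one of the
-distilled models listed above to OpenVINO IR. The Hugging Face model id and
-the output directory here are illustrative only:
-
-::
-
-   optimum-cli export openvino --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --task text-generation-with-past DeepSeek-R1-Distill-Qwen-1.5B-ov
-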
- -Weights Compression using Optimum-CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -You can also apply fp16, 8-bit or 4-bit weight compression on the -Linear, Convolutional and Embedding layers when exporting your model -with the CLI. - -.. raw:: html - -
-
-.. raw:: html
-
-
-
-Click here to read more about weights compression with Optimum CLI
-
-.. raw:: html
-
-
-
-Set ``--weight-format`` to fp16, int8 or int4, respectively. This
-type of optimization reduces the memory footprint and inference
-latency. By default, the quantization scheme for int8/int4 is
-`asymmetric `__;
-to make it
-`symmetric `__
-you can add ``--sym``.
-
-For INT4 quantization you can also specify the following arguments:
-
-- The ``--group-size`` parameter defines the group size to use for
-  quantization; -1 results in per-column quantization.
-- The ``--ratio`` parameter controls the ratio between 4-bit and 8-bit
-  quantization. If set to 0.9, 90% of the layers will be
-  quantized to int4 while 10% will be quantized to int8.
-
-Smaller group_size and ratio values usually improve accuracy at the
-expense of model size and inference latency. You can additionally apply AWQ
-during model export with INT4 precision by using the
-``--awq`` flag and providing a dataset name with the ``--dataset`` parameter
-(e.g. ``--dataset wikitext2``).
-
-   **Note**: Applying AWQ requires significant memory and time.
-
-..
-
-   **Note**: It is possible that there will be no matching patterns in
-   the model to apply AWQ; in such a case it will be skipped.
-
-.. raw:: html
-
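-
-As an illustration, a command along the following lines would export a model
-with INT4 weight compression and AWQ applied. The model id, group size, ratio,
-and dataset below are example values only; in this notebook the export is
-actually performed by the ``convert_and_compress_model`` helper in the next
-cell:
-
-::
-
-   optimum-cli export openvino --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B --task text-generation-with-past --weight-format int4 --group-size 64 --ratio 1.0 --awq --dataset wikitext2 DeepSeek-R1-Distill-Llama-8B-int4-ov
-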
- -.. code:: ipython3 - - from llm_config import convert_and_compress_model - - model_dir = convert_and_compress_model(model_id, model_configuration, compression_variant.value) - - -.. parsed-literal:: - - ✅ INT4 DeepSeek-R1-Distill-Llama-8B model already converted and can be found in DeepSeek-R1-Distill-Llama-8B/INT4_compressed_weights - - -.. code:: ipython3 - - from llm_config import compare_model_size - - compare_model_size(model_dir) - - -.. parsed-literal:: - - Size of model with INT4 compressed weights is 5081.91 MB - - -Instantiate pipeline with OpenVINO Generate API ------------------------------------------------ - - - -`OpenVINO Generate -API `__ -can be used to create pipelines to run an inference with OpenVINO -Runtime. - -Firstly we need to create a pipeline with ``LLMPipeline``. -``LLMPipeline`` is the main object used for text generation using LLM in -OpenVINO GenAI API. You can construct it straight away from the folder -with the converted model. We will provide directory with model and -device for ``LLMPipeline``. Then we run ``generate`` method and get the -output in text format. Additionally, we can configure parameters for -decoding. We can create the default config with -``ov_genai.GenerationConfig()``, setup parameters, and apply the updated -version with ``set_generation_config(config)`` or put config directly to -``generate()``. It’s also possible to specify the needed options just as -inputs in the ``generate()`` method, as shown below, e.g. we can add -``max_new_tokens`` to stop generation if a specified number of tokens is -generated and the end of generation is not reached. We will discuss some -of the available generation parameters more deeply later. Generation -process for long response may be time consuming, for accessing partial -result as soon as it is generated without waiting when whole process -finished, Streaming API can be used. Token streaming is the mode in -which the generative system returns the tokens one by one as the model -generates them. This enables showing progressive generations to the user -rather than waiting for the whole generation. Streaming is an essential -aspect of the end-user experience as it reduces latency, one of the most -critical aspects of a smooth experience. In code below, we implement -simple streamer for printing output result. For more advanced streamer -example please check openvino.genai -`sample `__. - -.. code:: ipython3 - - import openvino_genai as ov_genai - import sys - - print(f"Loading model from {model_dir}\n") - - - pipe = ov_genai.LLMPipeline(str(model_dir), device.value) - if "genai_chat_template" in model_configuration: - pipe.get_tokenizer().set_chat_template(model_configuration["genai_chat_template"]) - - generation_config = ov_genai.GenerationConfig() - generation_config.max_new_tokens = 128 - - - def streamer(subword): - print(subword, end="", flush=True) - sys.stdout.flush() - # Return flag corresponds whether generation should be stopped. - # False means continue generation. - return False - - - input_prompt = "What is OpenVINO?" - print(f"Input text: {input_prompt}") - result = pipe.generate(input_prompt, generation_config, streamer) - - -.. parsed-literal:: - - Loading model from DeepSeek-R1-Distill-Llama-8B/INT4_compressed_weights - - Input text: What is OpenVINO? - It's an open-source model optimization tool that accelerates AI deployment across various platforms. It supports multiple frameworks and platforms, providing tools for quantization, pruning, and knowledge distillation. 
OpenVINO is designed to help developers reduce the computational requirements of AI models, making them more efficient and deployable on resource-constrained environments.
-
-    What is OpenVINO? It's an open-source model optimization tool that accelerates AI deployment across various platforms. It supports multiple frameworks and platforms, providing tools for quantization, pruning, and knowledge distillation. OpenVINO is designed to help developers reduce the computational requirements of AI models, making them more
-
-Run Chatbot
------------
-
-
-
-Now that the model is created, we can set up a chatbot interface using
-`Gradio `__.
-
-.. raw:: html
-
-
-.. raw:: html
-
-
-
-Click here to see how the pipeline works
-
-.. raw:: html
-
-
-
-The diagram below illustrates how the chatbot pipeline works.
-
-.. figure:: https://github.com/user-attachments/assets/9c9b56e1-01a6-48d8-aa46-222a88e25066
-   :alt: llm_diagram
-
-   llm_diagram
-
-As you can see, the user's question is first passed through the tokenizer,
-which applies chat-specific formatting (the chat template) and turns the
-provided string into numeric format. `OpenVINO
-Tokenizers `__
-are used for this purpose inside ``LLMPipeline``. You can find more detailed
-information about tokenization theory and OpenVINO Tokenizers in this
-`tutorial `__.
-The tokenized input is then passed to the LLM, which predicts the probability
-of the next token. The way the next token is selected from the predicted
-probabilities is determined by the chosen decoding methodology. You can
-find more information about the most popular decoding methods in this
-`blog `__. The sampler
-selects the next token id according to the generation configuration.
-Next, a stop-generation condition is applied to check whether generation
-is finished (for example, the maximum number of new tokens has been reached
-or the next token id equals the end-of-generation token). If the end of
-generation is not reached, the newly generated token id is used as the input
-for the next iteration, and the generation cycle repeats until the condition
-is met. When the stop-generation criteria are met, the OpenVINO Detokenizer
-decodes the generated token ids into a text answer.
-
-The difference between chatbot and instruction-following pipelines is
-that a chatbot model needs "memory" to answer a chain of connected questions
-correctly. OpenVINO GenAI uses a ``KVCache`` representation to maintain the
-conversation history. By default, ``LLMPipeline`` resets the ``KVCache``
-after each ``generate`` call. To keep the conversational history, we should
-switch LLMPipeline to chat mode using the ``start_chat()`` method.
-
-More info about OpenVINO LLM inference can be found in the `LLM Inference
-Guide `__.
-
-.. raw:: html
-
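-
-As a minimal sketch, chat mode can be used as shown below. It assumes the
-``pipe`` and ``generation_config`` objects created earlier in this notebook;
-the prompts are illustrative only:
-
-.. code:: ipython3
-
-    # Keep the KVCache (conversation history) between generate() calls.
-    pipe.start_chat()
-    try:
-        print(pipe.generate("What is OpenVINO?", generation_config))
-        # The follow-up question relies on the history kept in chat mode.
-        print(pipe.generate("Which devices can it run models on?", generation_config))
-    finally:
-        # Reset the conversation history when the chat is finished.
-        pipe.finish_chat()
-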
- -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/deepseek-r1/gradio_helper.py") - open("gradio_helper_genai.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(pipe, model_configuration, model_id, lang.value, device.value == "NPU") - - try: - demo.launch(debug=True) - except Exception: - demo.launch(debug=True, share=True) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ diff --git a/docs/notebooks/depth-anything-v2-with-output.rst b/docs/notebooks/depth-anything-v2-with-output.rst deleted file mode 100644 index 58c2fe9351eaaa..00000000000000 --- a/docs/notebooks/depth-anything-v2-with-output.rst +++ /dev/null @@ -1,1122 +0,0 @@ -Depth estimation with DepthAnythingV2 and OpenVINO -================================================== - -`Depth Anything V2 `__ is a -solution for robust relative depth estimation. Without pursuing fancy -techniques, this project aim to reveal crucial findings to pave the way -towards building a powerful monocular depth estimation model. This model -is an improvement of the `Depth Anything -V1 `__. Notably, compared with V1, -this version produces much finer and more robust depth predictions -through three key practices: replacing all labeled real images with -synthetic images, scaling up the capacity of our teacher model and -teaching student models via the bridge of large-scale pseudo-labeled -real images. - -The pipeline of training of Depth Anything V2 is shown below. It -consists of three steps: train a reliable teacher model purely on -high-quality synthetic image; produce precise pseudo depth on -large-scale unlabeled real images; train final student models on -pseudo-labeled real images for robust generalization. - -.. figure:: https://depth-anything-v2.github.io/static/images/pipeline.png - :alt: image.png - - image.png - -More details about model can be found in `project web -page `__, -`paper `__ and official -`repository `__ - -In this tutorial we will explore how to convert and run DepthAnythingV2 -using OpenVINO. An additional part demonstrates how to run quantization -with `NNCF `__ to speed up the -model. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Load and run PyTorch model <#load-and-run-pytorch-model>`__ - - - `Prepare input data <#prepare-input-data>`__ - - `Run model inference <#run-model-inference>`__ - -- `Convert Model to OpenVINO IR - format <#convert-model-to-openvino-ir-format>`__ -- `Run OpenVINO model inference <#run-openvino-model-inference>`__ - - - `Select inference device <#select-inference-device>`__ - - `Run inference on image <#run-inference-on-image>`__ - - `Run inference on video <#run-inference-on-video>`__ - -- `Quantization <#quantization>`__ - - - `Prepare calibration dataset <#prepare-calibration-dataset>`__ - - `Run quantization <#run-quantization>`__ - - - `Compare model file size <#compare-model-file-size>`__ - - - `Compare inference time of the FP16 and INT8 - models <#compare-inference-time-of-the-fp16-and-int8-models>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. 
- -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - import requests - from pathlib import Path - - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("depth-anything-v2.ipynb") - -.. code:: ipython3 - - from cmd_helper import clone_repo - - - clone_repo("https://huggingface.co/spaces/depth-anything/Depth-Anything-V2") - - - - -.. parsed-literal:: - - PosixPath('Depth-Anything-V2') - - - -.. code:: ipython3 - - import platform - - - %pip install -q "openvino>=2024.2.0" "datasets>=2.14.6" "nncf>=2.11.0" "tqdm" "matplotlib>=3.4" - %pip install -q "typing-extensions>=4.9.0" eval-type-backport "gradio>=4.19" gradio_imageslider - %pip install -q torch torchvision "opencv-python" huggingface_hub --extra-index-url https://download.pytorch.org/whl/cpu - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - if platform.python_version_tuple()[1] in ["8", "9"]: - %pip install -q "gradio-imageslider<=0.0.17" "typing-extensions>=4.9.0" - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - -Load and run PyTorch model --------------------------- - - - -To be able run PyTorch model on CPU, we should disable xformers -attention optimizations first. - -.. code:: ipython3 - - attention_file_path = Path("./Depth-Anything-V2/depth_anything_v2/dinov2_layers/attention.py") - orig_attention_path = attention_file_path.parent / ("orig_" + attention_file_path.name) - - if not orig_attention_path.exists(): - attention_file_path.rename(orig_attention_path) - - with orig_attention_path.open("r") as f: - data = f.read() - data = data.replace("XFORMERS_AVAILABLE = True", "XFORMERS_AVAILABLE = False") - with attention_file_path.open("w") as out_f: - out_f.write(data) - -Prepare input data -~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from PIL import Image - - from notebook_utils import download_file, device_widget, quantization_widget - - if not Path("furseal.png").exists(): - download_file( - "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/3f779fc1-c1b2-4dec-915a-64dae510a2bb", - "furseal.png", - ) - - Image.open("furseal.png").resize((600, 400)) - - - -.. parsed-literal:: - - furseal.png: 0%| | 0.00/2.55M [00:00 - - - - -.. image:: depth-anything-v2-with-output_files/depth-anything-v2-with-output_15_1.png - - -Convert Model to OpenVINO IR format ------------------------------------ - - - -OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate -Representation (IR). OpenVINO model conversion API should be used for -these purposes. 
``ov.convert_model`` function accepts original PyTorch -model instance and example input for tracing and returns ``ov.Model`` -representing this model in OpenVINO framework. Converted model can be -used for saving on disk using ``ov.save_model`` function or directly -loading on device using ``core.complie_model``. - -.. code:: ipython3 - - import openvino as ov - - OV_DEPTH_ANYTHING_PATH = Path(f"{model_id}.xml") - - if not OV_DEPTH_ANYTHING_PATH.exists(): - ov_model = ov.convert_model(model, example_input=torch.rand(1, 3, 518, 518), input=[1, 3, 518, 518]) - ov.save_model(ov_model, OV_DEPTH_ANYTHING_PATH) - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dinov2_layers/patch_embed.py:73: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dinov2_layers/patch_embed.py:74: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dinov2.py:183: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if npatch == N and w == h: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dpt.py:147: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True) - - -Run OpenVINO model inference ----------------------------- - - - -Now, we are ready to run OpenVINO model - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -For starting work, please select inference device from dropdown list. - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - core = ov.Core() - - compiled_model = core.compile_model(OV_DEPTH_ANYTHING_PATH, device.value) - -Run inference on image -~~~~~~~~~~~~~~~~~~~~~~ - - - -For simplicity of usage, model authors provide helper functions for -preprocessing input image. 
The main conditions are that image size -should be divisible on 14 (size of vit patch) and normalized in [0, 1] -range. - -.. code:: ipython3 - - from depth_anything_v2.util.transform import Resize, NormalizeImage, PrepareForNet - from torchvision.transforms import Compose - - transform = Compose( - [ - Resize( - width=518, - height=518, - resize_target=False, - ensure_multiple_of=14, - resize_method="lower_bound", - image_interpolation_method=cv2.INTER_CUBIC, - ), - NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - PrepareForNet(), - ] - ) - -.. code:: ipython3 - - h, w = raw_img.shape[:-1] - - image = cv2.cvtColor(raw_img, cv2.COLOR_BGR2RGB) / 255.0 - image = transform({"image": image})["image"] - image = torch.from_numpy(image).unsqueeze(0) - - res = compiled_model(image)[0] - -.. code:: ipython3 - - depth_color = get_depth_map(res[0], w, h) - -.. code:: ipython3 - - plt.imshow(depth_color[:, :, ::-1]) - - - - -.. parsed-literal:: - - - - - - -.. image:: depth-anything-v2-with-output_files/depth-anything-v2-with-output_25_1.png - - -Run inference on video -~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - VIDEO_FILE = "./Coco Walking in Berkeley.mp4" - - if not Path(VIDEO_FILE).exists(): - download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Coco%20Walking%20in%20Berkeley.mp4", - VIDEO_FILE, - ) - - # Number of seconds of input video to process. Set `NUM_SECONDS` to 0 to process - # the full video. - NUM_SECONDS = 4 - # Set `ADVANCE_FRAMES` to 1 to process every frame from the input video - # Set `ADVANCE_FRAMES` to 2 to process every second frame. This reduces - # the time it takes to process the video. - ADVANCE_FRAMES = 2 - # Set `SCALE_OUTPUT` to reduce the size of the result video - # If `SCALE_OUTPUT` is 0.5, the width and height of the result video - # will be half the width and height of the input video. - SCALE_OUTPUT = 0.5 - # The format to use for video encoding. The 'vp09` is slow, - # but it works on most systems. - # Try the `THEO` encoding if you have FFMPEG installed. - # FOURCC = cv2.VideoWriter_fourcc(*"THEO") - FOURCC = cv2.VideoWriter_fourcc(*"vp09") - - # Create Path objects for the input video and the result video. - output_directory = Path("output") - output_directory.mkdir(exist_ok=True) - result_video_path = output_directory / f"{Path(VIDEO_FILE).stem}_depth_anything.mp4" - - - -.. parsed-literal:: - - Coco Walking in Berkeley.mp4: 0%| | 0.00/877k [00:00 np.ndarray: - """ - Convert image_data from BGR to RGB - """ - return cv2.cvtColor(image_data, cv2.COLOR_BGR2RGB) - -.. code:: ipython3 - - import time - from IPython.display import ( - HTML, - FileLink, - Pretty, - ProgressBar, - Video, - clear_output, - display, - ) - - - def process_video(compiled_model, video_file, result_video_path): - # Initialize variables. - input_video_frame_nr = 0 - start_time = time.perf_counter() - total_inference_duration = 0 - - # Open the input video - cap = cv2.VideoCapture(str(video_file)) - - # Create a result video. 
- out_video = cv2.VideoWriter( - str(result_video_path), - FOURCC, - target_fps, - (target_frame_width * 2, target_frame_height), - ) - - num_frames = int(NUM_SECONDS * input_fps) - total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) if num_frames == 0 else num_frames - progress_bar = ProgressBar(total=total_frames) - progress_bar.display() - - try: - while cap.isOpened(): - ret, image = cap.read() - if not ret: - cap.release() - break - - if input_video_frame_nr >= total_frames: - break - - h, w = image.shape[:-1] - input_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 - input_image = transform({"image": input_image})["image"] - # Reshape the image to network input shape NCHW. - input_image = np.expand_dims(input_image, 0) - - # Do inference. - inference_start_time = time.perf_counter() - result = compiled_model(input_image)[0] - inference_stop_time = time.perf_counter() - inference_duration = inference_stop_time - inference_start_time - total_inference_duration += inference_duration - - if input_video_frame_nr % (10 * ADVANCE_FRAMES) == 0: - clear_output(wait=True) - progress_bar.display() - # input_video_frame_nr // ADVANCE_FRAMES gives the number of - # Frames that have been processed by the network. - display( - Pretty( - f"Processed frame {input_video_frame_nr // ADVANCE_FRAMES}" - f"/{total_frames // ADVANCE_FRAMES}. " - f"Inference time per frame: {inference_duration:.2f} seconds " - f"({1/inference_duration:.2f} FPS)" - ) - ) - - # Transform the network result to a RGB image. - result_frame = to_rgb(convert_result_to_image(result)) - # Resize the image and the result to a target frame shape. - result_frame = cv2.resize(result_frame, (target_frame_width, target_frame_height)) - image = cv2.resize(image, (target_frame_width, target_frame_height)) - # Put the image and the result side by side. - stacked_frame = np.hstack((image, result_frame)) - # Save a frame to the video. - out_video.write(stacked_frame) - - input_video_frame_nr = input_video_frame_nr + ADVANCE_FRAMES - cap.set(1, input_video_frame_nr) - - progress_bar.progress = input_video_frame_nr - progress_bar.update() - - except KeyboardInterrupt: - print("Processing interrupted.") - finally: - clear_output() - processed_frames = num_frames // ADVANCE_FRAMES - out_video.release() - cap.release() - end_time = time.perf_counter() - duration = end_time - start_time - - print( - f"Processed {processed_frames} frames in {duration:.2f} seconds. " - f"Total FPS (including video processing): {processed_frames/duration:.2f}." - f"Inference FPS: {processed_frames/total_inference_duration:.2f} " - ) - print(f"Video saved to '{str(result_video_path)}'.") - return stacked_frame - -.. code:: ipython3 - - stacked_frame = process_video(compiled_model, VIDEO_FILE, result_video_path) - - -.. parsed-literal:: - - Processed 60 frames in 13.14 seconds. Total FPS (including video processing): 4.57.Inference FPS: 10.69 - Video saved to 'output/Coco Walking in Berkeley_depth_anything.mp4'. - - -.. code:: ipython3 - - def display_video(stacked_frame): - video = Video(result_video_path, width=800, embed=True) - if not result_video_path.exists(): - plt.imshow(stacked_frame) - raise ValueError("OpenCV was unable to write the video file. 
Showing one video frame.") - else: - print(f"Showing video saved at\n{result_video_path.resolve()}") - print("If you cannot see the video in your browser, please click on the " "following link to download the video ") - video_link = FileLink(result_video_path) - video_link.html_link_str = "%s" - display(HTML(video_link._repr_html_())) - display(video) - -.. code:: ipython3 - - display_video(stacked_frame) - - -.. parsed-literal:: - - Showing video saved at - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/output/Coco Walking in Berkeley_depth_anything.mp4 - If you cannot see the video in your browser, please click on the following link to download the video - - - -.. raw:: html - - output/Coco Walking in Berkeley_depth_anything.mp4
- - - -.. raw:: html - - - - -Quantization ------------- - - - -`NNCF `__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -The optimization process contains the following steps: - -1. Create a calibration dataset for quantization. -2. Run ``nncf.quantize()`` to obtain quantized model. -3. Save the ``INT8`` model using ``openvino.save_model()`` function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - OV_DEPTH_ANYTHING_INT8_PATH = Path(f"{model_id}_int8.xml") - - %load_ext skip_kernel_extension - -Prepare calibration dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use a portion of -`Nahrawy/VIDIT-Depth-ControlNet `__ -dataset from Hugging Face as calibration data. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import datasets - - if not OV_DEPTH_ANYTHING_INT8_PATH.exists(): - subset_size = 300 - calibration_data = [] - dataset = datasets.load_dataset("Nahrawy/VIDIT-Depth-ControlNet", split="train", streaming=True).shuffle(seed=42).take(subset_size) - for batch in dataset: - image = np.array(batch["image"])[...,:3] - image = image / 255.0 - image = transform({'image': image})['image'] - image = np.expand_dims(image, 0) - calibration_data.append(image) - - - -.. parsed-literal:: - - Resolving data files: 0%| | 0/42 [00:00 **NOTE**: Quantization is time and memory consuming operation. Running -quantization code below may take some time. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - - if not OV_DEPTH_ANYTHING_INT8_PATH.exists(): - model = core.read_model(OV_DEPTH_ANYTHING_PATH) - quantized_model = nncf.quantize( - model=model, - subset_size=subset_size, - model_type=nncf.ModelType.TRANSFORMER, - calibration_dataset=nncf.Dataset(calibration_data), - ) - ov.save_model(quantized_model, OV_DEPTH_ANYTHING_INT8_PATH) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -.. parsed-literal:: - - 2025-02-04 00:03:38.061352: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2025-02-04 00:03:38.094509: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2025-02-04 00:03:38.671176: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - - -.. 
parsed-literal:: - - Output() - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - -Let us check predictions with the quantized model using the same input -data. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - def visualize_results(orig_img:Image.Image, optimized_img:Image.Image): - """ - Helper function for results visualization - - Parameters: - orig_img (Image.Image): generated image using FP16 model - optimized_img (Image.Image): generated image using quantized model - Returns: - fig (matplotlib.pyplot.Figure): matplotlib generated figure contains drawing result - """ - orig_title = "FP16 model" - control_title = "INT8 model" - figsize = (20, 20) - fig, axs = plt.subplots(1, 2, figsize=figsize, sharex='all', sharey='all') - list_axes = list(axs.flat) - for a in list_axes: - a.set_xticklabels([]) - a.set_yticklabels([]) - a.get_xaxis().set_visible(False) - a.get_yaxis().set_visible(False) - a.grid(False) - list_axes[0].imshow(np.array(orig_img)) - list_axes[1].imshow(np.array(optimized_img)) - list_axes[0].set_title(orig_title, fontsize=15) - list_axes[1].set_title(control_title, fontsize=15) - - fig.subplots_adjust(wspace=0.01, hspace=0.01) - fig.tight_layout() - return fig - -.. code:: ipython3 - - %%skip not $to_quantize.value - - image = cv2.cvtColor(cv2.imread('furseal.png'), cv2.COLOR_BGR2RGB) / 255.0 - image = transform({'image': image})['image'] - image = torch.from_numpy(image).unsqueeze(0) - - int8_compiled_model = core.compile_model(OV_DEPTH_ANYTHING_INT8_PATH, device.value) - int8_res = int8_compiled_model(image)[0] - int8_depth_color = get_depth_map(int8_res[0], w, h) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - visualize_results(depth_color[:, :, ::-1], int8_depth_color[:, :, ::-1]) - - - -.. image:: depth-anything-v2-with-output_files/depth-anything-v2-with-output_44_0.png - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - int8_result_video_path = output_directory / f"{Path(VIDEO_FILE).stem}_depth_anything_int8.mp4" - stacked_frame = process_video(int8_compiled_model, VIDEO_FILE, int8_result_video_path) - display_video(stacked_frame) - - -.. parsed-literal:: - - Processed 60 frames in 12.63 seconds. Total FPS (including video processing): 4.75.Inference FPS: 13.13 - Video saved to 'output/Coco Walking in Berkeley_depth_anything_int8.mp4'. - Showing video saved at - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/output/Coco Walking in Berkeley_depth_anything.mp4 - If you cannot see the video in your browser, please click on the following link to download the video - - - -.. raw:: html - - output/Coco Walking in Berkeley_depth_anything.mp4
- - - -.. raw:: html - - - - -Compare model file size -^^^^^^^^^^^^^^^^^^^^^^^ - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - fp16_ir_model_size = OV_DEPTH_ANYTHING_PATH.with_suffix(".bin").stat().st_size / 2**20 - quantized_model_size = OV_DEPTH_ANYTHING_INT8_PATH.with_suffix(".bin").stat().st_size / 2**20 - - print(f"FP16 model size: {fp16_ir_model_size:.2f} MB") - print(f"INT8 model size: {quantized_model_size:.2f} MB") - print(f"Model compression rate: {fp16_ir_model_size / quantized_model_size:.3f}") - - -.. parsed-literal:: - - FP16 model size: 47.11 MB - INT8 model size: 24.41 MB - Model compression rate: 1.930 - - -Compare inference time of the FP16 and INT8 models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To measure the inference performance of OpenVINO FP16 and INT8 models, -use `Benchmark -Tool `__. - - **NOTE**: For the most accurate performance estimation, it is - recommended to run ``benchmark_app`` in a terminal/command prompt - after closing other applications. - -.. code:: ipython3 - - import re - - - def get_fps(benchmark_output: str): - parsed_output = [line for line in benchmark_output if "Throughput:" in line] - fps = re.findall(r"\d+\.\d+", parsed_output[0])[0] - return fps - - - if OV_DEPTH_ANYTHING_INT8_PATH.exists(): - benchmark_output = !benchmark_app -m $OV_DEPTH_ANYTHING_PATH -d $device.value -api async - original_fps = get_fps(benchmark_output) - print(f"FP16 Throughput: {original_fps} FPS") - - benchmark_output = !benchmark_app -m $OV_DEPTH_ANYTHING_INT8_PATH -d $device.value -api async - optimized_fps = get_fps(benchmark_output) - print(f"INT8 Throughput: {optimized_fps} FPS") - print(f"Speed-up: {float(optimized_fps) / float(original_fps):.2f}") - - -.. parsed-literal:: - - FP16 Throughput: 10.74 FPS - INT8 Throughput: 14.11 FPS - Speed-up: 1.31 - - -Interactive demo ----------------- - - - -You can apply model on own images. You can move the slider on the -resulting image to switch between the original image and the depth map -view. - -Please select below whether you would like to use the quantized model to -launch the interactive demo. - -.. code:: ipython3 - - import ipywidgets as widgets - - quantized_model_present = OV_DEPTH_ANYTHING_INT8_PATH.exists() - - use_quantized_model = widgets.Checkbox( - value=True if quantized_model_present else False, - description="Use quantized model", - disabled=False, - ) - - use_quantized_model - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Use quantized model') - - - -.. code:: ipython3 - - import numpy as np - import cv2 - import tempfile - - if use_quantized_model.value: - compiled_model = core.compile_model(OV_DEPTH_ANYTHING_INT8_PATH, device.value) - - - def predict_depth(model, image): - return model(image)[0] - - - def on_submit(image): - original_image = image.copy() - - h, w = image.shape[:2] - - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 - image = transform({"image": image})["image"] - image = np.expand_dims(image, 0) - - depth = predict_depth(compiled_model, image) - depth = cv2.resize(depth[0], (w, h), interpolation=cv2.INTER_LINEAR) - - depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 - depth = depth.astype(np.uint8) - colored_depth = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)[:, :, ::-1] - - colored_depth_img = Image.fromarray(colored_depth) - tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False) - colored_depth_img.save(tmp.name) - - return [(original_image, colored_depth), tmp.name] - -.. 
code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/depth-anything/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=on_submit, examples_dir="Depth-Anything-V2/assets/examples") - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - - -.. parsed-literal:: - - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. - - - - - - - diff --git a/docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_15_1.png b/docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_15_1.png deleted file mode 100644 index a4e01af7f912e6..00000000000000 --- a/docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_15_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:da4d703d3b914455b7a7d10e2b43fe3b0a9ab3c9e4aa9336490d30b3fbe24015 -size 88497 diff --git a/docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_25_1.png b/docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_25_1.png deleted file mode 100644 index 9c25e30b532a13..00000000000000 --- a/docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_25_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:833f02eec49d4529fb1c46026ed9743fc23ff1343026bfb7c44c50bc32f19a1b -size 85083 diff --git a/docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_44_0.png b/docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_44_0.png deleted file mode 100644 index 4b565f09d15c18..00000000000000 --- a/docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_44_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:edcccebe443ff18301875414bac9c3187f1534cff693c073650975613e9c4aa9 -size 628003 diff --git a/docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_9_1.jpg b/docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_9_1.jpg deleted file mode 100644 index fbc0e8f5282659..00000000000000 --- a/docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_9_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c5bc6f9759cd2f8312bc66e3adc3e48d0a2c3f1d1393cd62672832004383085 -size 40145 diff --git a/docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_9_1.png b/docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_9_1.png deleted file mode 100644 index 096d76fe63cc45..00000000000000 --- a/docs/notebooks/depth-anything-v2-with-output_files/depth-anything-v2-with-output_9_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3f279ac10beacc7f27d1af73f7917d060c4c33db80b09d78c85dc864bd4bb3a7 -size 424224 diff --git a/docs/notebooks/depth-anything-with-output.rst b/docs/notebooks/depth-anything-with-output.rst deleted file mode 100644 index 
2ff4d4c03f424d..00000000000000 --- a/docs/notebooks/depth-anything-with-output.rst +++ /dev/null @@ -1,1080 +0,0 @@ -Depth estimation with DepthAnything and OpenVINO -================================================ - -`Depth Anything `__ is a highly -practical solution for robust monocular depth estimation. Without -pursuing novel technical modules, this project aims to build a simple -yet powerful foundation model dealing with any images under any -circumstances. The framework of Depth Anything is shown below. it adopts -a standard pipeline to unleashing the power of large-scale unlabeled -images. |image.png| - -More details about model can be found in `project web -page `__, -`paper `__ and official -`repository `__ - -In this tutorial we will explore how to convert and run DepthAnything -using OpenVINO. An additional part demonstrates how to run quantization -with `NNCF `__ to speed up the -model. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Load and run PyTorch model <#load-and-run-pytorch-model>`__ - - - `Prepare input data <#prepare-input-data>`__ - - `Run model inference <#run-model-inference>`__ - -- `Convert Model to OpenVINO IR - format <#convert-model-to-openvino-ir-format>`__ -- `Run OpenVINO model inference <#run-openvino-model-inference>`__ - - - `Select inference device <#select-inference-device>`__ - - `Run inference on image <#run-inference-on-image>`__ - - `Run inference on video <#run-inference-on-video>`__ - -- `Quantization <#quantization>`__ - - - `Prepare calibration dataset <#prepare-calibration-dataset>`__ - - `Run quantization <#run-quantization>`__ - - `Compare inference time of the FP16 and INT8 - models <#compare-inference-time-of-the-fp16-and-int8-models>`__ - - `Compare model file size <#compare-unet-file-size>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. |image.png| image:: https://depth-anything.github.io/static/images/pipeline.png - -Prerequisites -------------- - - - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - -.. code:: ipython3 - - from cmd_helper import clone_repo - - repo_dir = clone_repo("https://github.com/LiheYoung/Depth-Anything") - - %cd $repo_dir - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything - - -.. 
code:: ipython3 - - import platform - - - %pip install -q "openvino>=2023.3.0" "datasets>=2.14.6" "nncf" "tqdm" - %pip install -q "typing-extensions>=4.9.0" eval-type-backport "gradio>=4.19" "matplotlib>=3.4" - %pip install -q torch torchvision "opencv-python" huggingface_hub --extra-index-url https://download.pytorch.org/whl/cpu - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - if platform.python_version_tuple()[1] in ["8", "9"]: - %pip install -q "gradio-imageslider<=0.0.17" "typing-extensions>=4.9.0" - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - -Load and run PyTorch model --------------------------- - - - -To be able run PyTorch model on CPU, we should disable xformers -attention optimizations first. - -.. code:: ipython3 - - from pathlib import Path - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("depth-anything.ipynb") - - - attention_file_path = Path("./torchhub/facebookresearch_dinov2_main/dinov2/layers/attention.py") - orig_attention_path = attention_file_path.parent / ("orig_" + attention_file_path.name) - - if not orig_attention_path.exists(): - attention_file_path.rename(orig_attention_path) - - with orig_attention_path.open("r") as f: - data = f.read() - data = data.replace("XFORMERS_AVAILABLE = True", "XFORMERS_AVAILABLE = False") - with attention_file_path.open("w") as out_f: - out_f.write(data) - -``DepthAnything.from_pretrained`` method creates PyTorch model class -instance and load model weights. There are 3 available models in -repository depends on VIT encoder size: \* Depth-Anything-ViT-Small -(24.8M) \* Depth-Anything-ViT-Base (97.5M) \* Depth-Anything-ViT-Large -(335.3M) - -We will use ``Depth-Anything-ViT-Small``, but the same steps for running -model and converting to OpenVINO are applicable for other models from -DepthAnything family. - -.. code:: ipython3 - - from depth_anything.dpt import DepthAnything - - encoder = "vits" # can also be 'vitb' or 'vitl' - model_id = "depth_anything_{:}14".format(encoder) - depth_anything = DepthAnything.from_pretrained(f"LiheYoung/{model_id}") - - -.. parsed-literal:: - - xFormers not available - xFormers not available - - -Prepare input data -~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from PIL import Image - - from notebook_utils import download_file, device_widget, quantization_widget - - if not Path("furseal.png").exists(): - download_file( - "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/3f779fc1-c1b2-4dec-915a-64dae510a2bb", - "furseal.png", - ) - - Image.open("furseal.png").resize((600, 400)) - - - -.. parsed-literal:: - - furseal.png: 0%| | 0.00/2.55M [00:00 np.ndarray: - """ - Convert image_data from BGR to RGB - """ - return cv2.cvtColor(image_data, cv2.COLOR_BGR2RGB) - -.. 
code:: ipython3 - - import time - from IPython.display import ( - HTML, - FileLink, - Pretty, - ProgressBar, - Video, - clear_output, - display, - ) - - - def process_video(compiled_model, video_file, result_video_path): - # Initialize variables. - input_video_frame_nr = 0 - start_time = time.perf_counter() - total_inference_duration = 0 - - # Open the input video - cap = cv2.VideoCapture(str(video_file)) - - # Create a result video. - out_video = cv2.VideoWriter( - str(result_video_path), - FOURCC, - target_fps, - (target_frame_width * 2, target_frame_height), - ) - - num_frames = int(NUM_SECONDS * input_fps) - total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) if num_frames == 0 else num_frames - progress_bar = ProgressBar(total=total_frames) - progress_bar.display() - - try: - while cap.isOpened(): - ret, image = cap.read() - if not ret: - cap.release() - break - - if input_video_frame_nr >= total_frames: - break - - h, w = image.shape[:-1] - input_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 - input_image = transform({"image": input_image})["image"] - # Reshape the image to network input shape NCHW. - input_image = np.expand_dims(input_image, 0) - - # Do inference. - inference_start_time = time.perf_counter() - result = compiled_model(input_image)[0] - inference_stop_time = time.perf_counter() - inference_duration = inference_stop_time - inference_start_time - total_inference_duration += inference_duration - - if input_video_frame_nr % (10 * ADVANCE_FRAMES) == 0: - clear_output(wait=True) - progress_bar.display() - # input_video_frame_nr // ADVANCE_FRAMES gives the number of - # Frames that have been processed by the network. - display( - Pretty( - f"Processed frame {input_video_frame_nr // ADVANCE_FRAMES}" - f"/{total_frames // ADVANCE_FRAMES}. " - f"Inference time per frame: {inference_duration:.2f} seconds " - f"({1/inference_duration:.2f} FPS)" - ) - ) - - # Transform the network result to a RGB image. - result_frame = to_rgb(convert_result_to_image(result)) - # Resize the image and the result to a target frame shape. - result_frame = cv2.resize(result_frame, (target_frame_width, target_frame_height)) - image = cv2.resize(image, (target_frame_width, target_frame_height)) - # Put the image and the result side by side. - stacked_frame = np.hstack((image, result_frame)) - # Save a frame to the video. - out_video.write(stacked_frame) - - input_video_frame_nr = input_video_frame_nr + ADVANCE_FRAMES - cap.set(1, input_video_frame_nr) - - progress_bar.progress = input_video_frame_nr - progress_bar.update() - - except KeyboardInterrupt: - print("Processing interrupted.") - finally: - clear_output() - processed_frames = num_frames // ADVANCE_FRAMES - out_video.release() - cap.release() - end_time = time.perf_counter() - duration = end_time - start_time - - print( - f"Processed {processed_frames} frames in {duration:.2f} seconds. " - f"Total FPS (including video processing): {processed_frames/duration:.2f}." - f"Inference FPS: {processed_frames/total_inference_duration:.2f} " - ) - print(f"Video saved to '{str(result_video_path)}'.") - return stacked_frame - -.. code:: ipython3 - - stacked_frame = process_video(compiled_model, VIDEO_FILE, result_video_path) - - -.. parsed-literal:: - - Processed 60 frames in 13.17 seconds. Total FPS (including video processing): 4.56.Inference FPS: 10.67 - Video saved to 'output/Coco Walking in Berkeley_depth_anything.mp4'. - - -.. 
code:: ipython3 - - def display_video(stacked_frame): - video = Video(result_video_path, width=800, embed=True) - if not result_video_path.exists(): - plt.imshow(stacked_frame) - raise ValueError("OpenCV was unable to write the video file. Showing one video frame.") - else: - print(f"Showing video saved at\n{result_video_path.resolve()}") - print("If you cannot see the video in your browser, please click on the " "following link to download the video ") - video_link = FileLink(result_video_path) - video_link.html_link_str = "%s" - display(HTML(video_link._repr_html_())) - display(video) - -.. code:: ipython3 - - display_video(stacked_frame) - - -.. parsed-literal:: - - Showing video saved at - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything/output/Coco Walking in Berkeley_depth_anything.mp4 - If you cannot see the video in your browser, please click on the following link to download the video - - - -.. raw:: html - - output/Coco Walking in Berkeley_depth_anything.mp4
- - - -.. raw:: html - - - - -Quantization ------------- - - - -`NNCF `__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -The optimization process contains the following steps: - -1. Create a calibration dataset for quantization. -2. Run ``nncf.quantize()`` to obtain quantized model. -3. Save the ``INT8`` model using ``openvino.save_model()`` function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - OV_DEPTH_ANYTHING_INT8_PATH = Path(f"{model_id}_int8.xml") - - %load_ext skip_kernel_extension - -Prepare calibration dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use a portion of -`Nahrawy/VIDIT-Depth-ControlNet `__ -dataset from Hugging Face as calibration data. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import datasets - - if not OV_DEPTH_ANYTHING_INT8_PATH.exists(): - subset_size = 300 - calibration_data = [] - dataset = datasets.load_dataset("Nahrawy/VIDIT-Depth-ControlNet", split="train", streaming=True).shuffle(seed=42).take(subset_size) - for batch in dataset: - image = np.array(batch["image"])[...,:3] - image = image / 255.0 - image = transform({'image': image})['image'] - image = np.expand_dims(image, 0) - calibration_data.append(image) - - - -.. parsed-literal:: - - Resolving data files: 0%| | 0/42 [00:00 **NOTE**: Quantization is time and memory consuming operation. Running -quantization code below may take some time. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - - if not OV_DEPTH_ANYTHING_INT8_PATH.exists(): - model = core.read_model(OV_DEPTH_ANYTHING_PATH) - quantized_model = nncf.quantize( - model=model, - subset_size=subset_size, - model_type=nncf.ModelType.TRANSFORMER, - calibration_dataset=nncf.Dataset(calibration_data), - ) - ov.save_model(quantized_model, OV_DEPTH_ANYTHING_INT8_PATH) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -.. parsed-literal:: - - 2025-02-04 00:12:23.299021: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2025-02-04 00:12:23.331758: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2025-02-04 00:12:23.910412: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - - -.. 
parsed-literal:: - - Output() - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - -Let us check predictions with the quantized model using the same input -data. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - def visualize_results(orig_img:Image.Image, optimized_img:Image.Image): - """ - Helper function for results visualization - - Parameters: - orig_img (Image.Image): generated image using FP16 model - optimized_img (Image.Image): generated image using quantized model - Returns: - fig (matplotlib.pyplot.Figure): matplotlib generated figure contains drawing result - """ - orig_title = "FP16 model" - control_title = "INT8 model" - figsize = (20, 20) - fig, axs = plt.subplots(1, 2, figsize=figsize, sharex='all', sharey='all') - list_axes = list(axs.flat) - for a in list_axes: - a.set_xticklabels([]) - a.set_yticklabels([]) - a.get_xaxis().set_visible(False) - a.get_yaxis().set_visible(False) - a.grid(False) - list_axes[0].imshow(np.array(orig_img)) - list_axes[1].imshow(np.array(optimized_img)) - list_axes[0].set_title(orig_title, fontsize=15) - list_axes[1].set_title(control_title, fontsize=15) - - fig.subplots_adjust(wspace=0.01, hspace=0.01) - fig.tight_layout() - return fig - -.. code:: ipython3 - - %%skip not $to_quantize.value - - image = cv2.cvtColor(cv2.imread('furseal.png'), cv2.COLOR_BGR2RGB) / 255.0 - image = transform({'image': image})['image'] - image = torch.from_numpy(image).unsqueeze(0) - - int8_compiled_model = core.compile_model(OV_DEPTH_ANYTHING_INT8_PATH, device.value) - int8_res = int8_compiled_model(image)[0] - int8_depth_color = get_depth_map(int8_res) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - visualize_results(depth_color[:, :, ::-1], int8_depth_color[:, :, ::-1]) - - - -.. image:: depth-anything-with-output_files/depth-anything-with-output_46_0.png - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - int8_result_video_path = output_directory / f"{Path(VIDEO_FILE).stem}_depth_anything_int8.mp4" - stacked_frame = process_video(int8_compiled_model, VIDEO_FILE, int8_result_video_path) - display_video(stacked_frame) - - -.. parsed-literal:: - - Processed 60 frames in 13.30 seconds. Total FPS (including video processing): 4.51.Inference FPS: 11.47 - Video saved to 'output/Coco Walking in Berkeley_depth_anything_int8.mp4'. - Showing video saved at - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything/output/Coco Walking in Berkeley_depth_anything.mp4 - If you cannot see the video in your browser, please click on the following link to download the video - - - -.. raw:: html - - output/Coco Walking in Berkeley_depth_anything.mp4
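In addition to the visual side-by-side comparison above, the closeness of the two models can be checked numerically. This optional sketch is not part of the original notebook (skip it if quantization was not run); it assumes the preprocessed ``image`` tensor, the FP16 ``compiled_model``, and ``int8_compiled_model`` from the previous cells are still available:

.. code:: ipython3

    import numpy as np

    # Run both models on the same preprocessed input and compare the raw depth maps.
    fp16_depth = compiled_model(image)[0]
    int8_depth = int8_compiled_model(image)[0]

    # Mean absolute difference, normalized by the FP16 dynamic range.
    diff = np.abs(fp16_depth - int8_depth).mean()
    scale = fp16_depth.max() - fp16_depth.min() + 1e-9
    print(f"Mean absolute difference (relative to FP16 range): {diff / scale:.4f}")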
- - - -.. raw:: html - - - - -Compare model file size -^^^^^^^^^^^^^^^^^^^^^^^ - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - fp16_ir_model_size = OV_DEPTH_ANYTHING_PATH.with_suffix(".bin").stat().st_size / 2**20 - quantized_model_size = OV_DEPTH_ANYTHING_INT8_PATH.with_suffix(".bin").stat().st_size / 2**20 - - print(f"FP16 model size: {fp16_ir_model_size:.2f} MB") - print(f"INT8 model size: {quantized_model_size:.2f} MB") - print(f"Model compression rate: {fp16_ir_model_size / quantized_model_size:.3f}") - - -.. parsed-literal:: - - FP16 model size: 47.11 MB - INT8 model size: 24.41 MB - Model compression rate: 1.930 - - -Compare inference time of the FP16 and INT8 models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To measure the inference performance of OpenVINO FP16 and INT8 models, -use `Benchmark -Tool `__. - - **NOTE**: For the most accurate performance estimation, it is - recommended to run ``benchmark_app`` in a terminal/command prompt - after closing other applications. - -.. code:: ipython3 - - import re - - - def get_fps(benchmark_output: str): - parsed_output = [line for line in benchmark_output if "Throughput:" in line] - fps = re.findall(r"\d+\.\d+", parsed_output[0])[0] - return fps - - - if OV_DEPTH_ANYTHING_INT8_PATH.exists(): - benchmark_output = !benchmark_app -m $OV_DEPTH_ANYTHING_PATH -d $device.value -api async - original_fps = get_fps(benchmark_output) - print(f"FP16 Throughput: {original_fps} FPS") - - benchmark_output = !benchmark_app -m $OV_DEPTH_ANYTHING_INT8_PATH -d $device.value -api async - optimized_fps = get_fps(benchmark_output) - print(f"INT8 Throughput: {optimized_fps} FPS") - print(f"Speed-up: {float(optimized_fps) / float(original_fps):.2f}") - - -.. parsed-literal:: - - FP16 Throughput: 10.68 FPS - INT8 Throughput: 14.11 FPS - Speed-up: 1.32 - - -Interactive demo ----------------- - - - -You can apply model on own images. You can move the slider on the -resulting image to switch between the original image and the depth map -view. - -Please select below whether you would like to use the quantized model to -launch the interactive demo. - -.. code:: ipython3 - - import ipywidgets as widgets - - quantized_model_present = OV_DEPTH_ANYTHING_INT8_PATH.exists() - - use_quantized_model = widgets.Checkbox( - value=True if quantized_model_present else False, - description="Use quantized model", - disabled=False, - ) - - use_quantized_model - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Use quantized model') - - - -.. code:: ipython3 - - import numpy as np - import cv2 - import tempfile - - if use_quantized_model.value: - compiled_model = core.compile_model(OV_DEPTH_ANYTHING_INT8_PATH, device.value) - - - def predict_depth(model, image): - return model(image)[0] - - - def on_submit(image): - original_image = image.copy() - - h, w = image.shape[:2] - - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 - image = transform({"image": image})["image"] - image = np.expand_dims(image, 0) - - depth = predict_depth(compiled_model, image) - depth = cv2.resize(depth[0], (w, h), interpolation=cv2.INTER_LINEAR) - - depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 - depth = depth.astype(np.uint8) - colored_depth = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)[:, :, ::-1] - - colored_depth_img = Image.fromarray(colored_depth) - tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False) - colored_depth_img.save(tmp.name) - - return [(original_image, colored_depth), tmp.name] - -.. 
code:: ipython3 - - # Go back to the depth-anything notebook directory - %cd .. - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/depth-anything/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=on_submit, examples_dir="Depth-Anything/assets/examples") - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/depth-anything - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. - - - - - - - diff --git a/docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_11_1.jpg b/docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_11_1.jpg deleted file mode 100644 index fbc0e8f5282659..00000000000000 --- a/docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_11_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c5bc6f9759cd2f8312bc66e3adc3e48d0a2c3f1d1393cd62672832004383085 -size 40145 diff --git a/docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_11_1.png b/docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_11_1.png deleted file mode 100644 index 096d76fe63cc45..00000000000000 --- a/docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_11_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3f279ac10beacc7f27d1af73f7917d060c4c33db80b09d78c85dc864bd4bb3a7 -size 424224 diff --git a/docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_18_0.png b/docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_18_0.png deleted file mode 100644 index 5d80ab50962d2c..00000000000000 --- a/docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_18_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a1cd87d709724c5f50721990ec02870e4e4d11566770dd749ec886bb94354d31 -size 80753 diff --git a/docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_27_0.png b/docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_27_0.png deleted file mode 100644 index e98ae4a16ac1c0..00000000000000 --- a/docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_27_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:50c015f2d9f0e9fdbed7023c9870da4e5ab2556ff232be54f00583865d1e3ef6 -size 80771 diff --git a/docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_46_0.png b/docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_46_0.png deleted file mode 100644 index 05c6539ba9c7a6..00000000000000 --- a/docs/notebooks/depth-anything-with-output_files/depth-anything-with-output_46_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:05cf8301ed553b4cf6755c36a8bf39af02f2416f08aa5f1cffcdd3ad04065159 -size 451417 diff --git 
a/docs/notebooks/detectron2-to-openvino-with-output.rst b/docs/notebooks/detectron2-to-openvino-with-output.rst deleted file mode 100644 index 3206f71ba192b3..00000000000000 --- a/docs/notebooks/detectron2-to-openvino-with-output.rst +++ /dev/null @@ -1,623 +0,0 @@ -Convert Detectron2 Models to OpenVINO™ -========================================= - -`Detectron2 `__ is -Facebook AI Research’s library that provides state-of-the-art detection -and segmentation algorithms. It is the successor of -`Detectron `__ and -`maskrcnn-benchmark `__. -It supports a number of computer vision research projects and production -applications. - -In this tutorial we consider how to convert and run Detectron2 models -using OpenVINO™. We will use ``Faster R-CNN FPN x1`` model and -``Mask R-CNN FPN x3`` pretrained on -`COCO `__ dataset as examples for object -detection and instance segmentation respectively. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ - - - `Define helpers for PyTorch model initialization and - conversion <#define-helpers-for-pytorch-model-initialization-and-conversion>`__ - - `Prepare input data <#prepare-input-data>`__ - -- `Object Detection <#object-detection>`__ - - - `Download PyTorch Detection - model <#download-pytorch-detection-model>`__ - - `Convert Detection Model to OpenVINO Intermediate - Representation <#convert-detection-model-to-openvino-intermediate-representation>`__ - - `Select inference device <#select-inference-device>`__ - - `Run Detection model inference <#run-detection-model-inference>`__ - -- `Instance Segmentation <#instance-segmentation>`__ - - - `Download Instance Segmentation PyTorch - model <#download-instance-segmentation-pytorch-model>`__ - - `Convert Instance Segmentation Model to OpenVINO Intermediate - Representation <#convert-instance-segmentation-model-to-openvino-intermediate-representation>`__ - - `Select inference device <#select-inference-device>`__ - - `Run Instance Segmentation model - inference <#run-instance-segmentation-model-inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install required packages for running model - -.. 
code:: ipython3 - - import os - import requests - from pathlib import Path - import platform - - - os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - with open("notebook_utils.py", "w") as f: - f.write(r.text) - - if not Path("pip_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py", - ) - open("pip_helper.py", "w").write(r.text) - - from pip_helper import pip_install - - if platform.system() == "Darwin": - pip_install("numpy<2.0.0") - pip_install("torch", "torchvision", "opencv-python", "wheel", "--extra-index-url", "https://download.pytorch.org/whl/cpu") - pip_install("git+https://github.com/facebookresearch/detectron2.git", "--extra-index-url", "https://download.pytorch.org/whl/cpu") - pip_install("openvino>=2023.1.0") - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("detectron2-to-openvino.ipynb") - - -.. parsed-literal:: - - Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu - Requirement already satisfied: torch in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (2.4.1+cpu) - Requirement already satisfied: torchvision in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (0.19.1+cpu) - Requirement already satisfied: opencv-python in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (4.11.0.86) - Requirement already satisfied: wheel in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (0.45.1) - Requirement already satisfied: filelock in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.16.1) - Requirement already satisfied: typing-extensions>=4.8.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (4.12.2) - Requirement already satisfied: sympy in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (1.13.3) - Requirement already satisfied: networkx in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.1) - Requirement already satisfied: jinja2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.1.5) - Requirement already satisfied: fsspec in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (2024.9.0) - Requirement already satisfied: numpy in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchvision) (1.24.3) - Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchvision) (10.4.0) - Requirement already satisfied: MarkupSafe>=2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jinja2->torch) (2.1.5) - Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from sympy->torch) (1.3.0) - Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu - Collecting git+https://github.com/facebookresearch/detectron2.git - Cloning https://github.com/facebookresearch/detectron2.git to /tmp/pip-req-build-qlom32g0 - - -.. parsed-literal:: - - Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-req-build-qlom32g0 - - -.. parsed-literal:: - - Resolved https://github.com/facebookresearch/detectron2.git to commit 9604f5995cc628619f0e4fd913453b4d7d61db3f - Preparing metadata (setup.py): started - Preparing metadata (setup.py): finished with status 'done' - Requirement already satisfied: Pillow>=7.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (10.4.0) - Requirement already satisfied: black in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (24.3.0) - Requirement already satisfied: cloudpickle in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (3.1.1) - Requirement already satisfied: fvcore<0.1.6,>=0.1.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (0.1.5.post20221221) - Collecting hydra-core>=1.1 (from detectron2==0.6) - Using cached hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB) - Collecting iopath<0.1.10,>=0.1.7 (from detectron2==0.6) - Using cached https://download.pytorch.org/whl/iopath-0.1.9-py3-none-any.whl (27 kB) - Requirement already satisfied: matplotlib in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (3.7.5) - Collecting omegaconf<2.4,>=2.1 (from detectron2==0.6) - Using cached omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB) - Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (24.2) - Requirement already satisfied: pycocotools>=2.0.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (2.0.7) - 
Requirement already satisfied: tabulate in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (0.9.0) - Requirement already satisfied: tensorboard in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (2.13.0) - Requirement already satisfied: termcolor>=1.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (2.4.0) - Requirement already satisfied: tqdm>4.29.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (4.67.1) - Requirement already satisfied: yacs>=0.1.8 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (0.1.8) - Requirement already satisfied: numpy in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from fvcore<0.1.6,>=0.1.5->detectron2==0.6) (1.24.3) - Requirement already satisfied: pyyaml>=5.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from fvcore<0.1.6,>=0.1.5->detectron2==0.6) (6.0.2) - Requirement already satisfied: antlr4-python3-runtime==4.9.* in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.1->detectron2==0.6) (4.9.3) - Requirement already satisfied: importlib-resources in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.1->detectron2==0.6) (6.4.5) - Requirement already satisfied: portalocker in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath<0.1.10,>=0.1.7->detectron2==0.6) (3.0.0) - Requirement already satisfied: contourpy>=1.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (1.1.1) - Requirement already satisfied: cycler>=0.10 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (0.12.1) - Requirement already satisfied: fonttools>=4.22.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (4.55.8) - Requirement already satisfied: kiwisolver>=1.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (1.4.7) - Requirement already satisfied: pyparsing>=2.3.1 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (3.1.4) - Requirement already satisfied: python-dateutil>=2.7 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (2.9.0.post0) - Requirement already satisfied: click>=8.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (8.1.8) - Requirement already satisfied: mypy-extensions>=0.4.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (1.0.0) - Requirement already satisfied: pathspec>=0.9.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (0.12.1) - Requirement already satisfied: platformdirs>=2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (4.3.6) - Requirement already satisfied: tomli>=1.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (2.2.1) - Requirement already satisfied: typing-extensions>=4.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (4.12.2) - Requirement already satisfied: absl-py>=0.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (1.4.0) - Requirement already satisfied: grpcio>=1.48.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (1.70.0) - Requirement already satisfied: google-auth<3,>=1.6.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (2.38.0) - Requirement already satisfied: google-auth-oauthlib<1.1,>=0.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (1.0.0) - Requirement already satisfied: markdown>=2.6.8 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (3.7) - Requirement already satisfied: protobuf>=3.19.6 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (3.20.3) - Requirement already satisfied: requests<3,>=2.21.0 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (2.32.3) - Requirement already satisfied: setuptools>=41.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (44.0.0) - Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (0.7.2) - Requirement already satisfied: werkzeug>=1.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (3.0.6) - Requirement already satisfied: wheel>=0.26 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (0.45.1) - Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (5.5.1) - Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (0.4.1) - Requirement already satisfied: rsa<5,>=3.1.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (4.9) - Requirement already satisfied: requests-oauthlib>=0.7.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth-oauthlib<1.1,>=0.5->tensorboard->detectron2==0.6) (2.0.0) - Requirement already satisfied: zipp>=3.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from importlib-resources->hydra-core>=1.1->detectron2==0.6) (3.20.2) - Requirement already satisfied: importlib-metadata>=4.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from markdown>=2.6.8->tensorboard->detectron2==0.6) (8.5.0) - Requirement already satisfied: six>=1.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from python-dateutil>=2.7->matplotlib->detectron2==0.6) (1.17.0) - Requirement already satisfied: charset-normalizer<4,>=2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (3.4.1) - Requirement already satisfied: idna<4,>=2.5 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (3.10) - Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (2.2.3) - Requirement already satisfied: certifi>=2017.4.17 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (2025.1.31) - Requirement already satisfied: MarkupSafe>=2.1.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from werkzeug>=1.0.1->tensorboard->detectron2==0.6) (2.1.5) - Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (0.6.1) - Requirement already satisfied: oauthlib>=3.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<1.1,>=0.5->tensorboard->detectron2==0.6) (3.2.2) - Using cached hydra_core-1.3.2-py3-none-any.whl (154 kB) - Using cached omegaconf-2.3.0-py3-none-any.whl (79 kB) - Building wheels for collected packages: detectron2 - Building wheel for detectron2 (setup.py): started - Building wheel for detectron2 (setup.py): finished with status 'done' - Created wheel for detectron2: filename=detectron2-0.6-cp38-cp38-linux_x86_64.whl size=8313065 sha256=53d05fc2b286cf1ed935fc3e9ee871e98783e0bc0f2efef495bafae54f7733ea - Stored in directory: /tmp/pip-ephem-wheel-cache-u1zzwsfg/wheels/19/ac/65/e48e5e4ec2702274d927c5a6efb75709b24014371d3bb778f2 - Successfully built detectron2 - Installing collected packages: omegaconf, iopath, hydra-core, detectron2 - Attempting uninstall: omegaconf - Found existing installation: omegaconf 2.4.0.dev3 - Uninstalling omegaconf-2.4.0.dev3: - Successfully uninstalled omegaconf-2.4.0.dev3 - Attempting uninstall: iopath - Found existing installation: iopath 0.1.10 - Uninstalling iopath-0.1.10: - Successfully uninstalled iopath-0.1.10 - Successfully installed detectron2-0.6 hydra-core-1.3.2 iopath-0.1.9 omegaconf-2.3.0 - Requirement already satisfied: openvino>=2023.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (2024.4.0) - Requirement already satisfied: numpy<2.1.0,>=1.16.6 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2023.1.0) (1.24.3) - Requirement already satisfied: openvino-telemetry>=2023.2.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2023.1.0) (2025.0.0) - Requirement already satisfied: packaging in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2023.1.0) (24.2)
-
-
-Define helpers for PyTorch model initialization and conversion
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
-
-Detectron2 provides a universal and configurable API for working with
-models. This means that all steps required for model creation, conversion
-and inference are common for all models, so it is enough to define the
-helper functions once and then reuse them for different models. To
-obtain models, we will use the `Detectron2 Model
-Zoo `__
-API. The ``detectron_zoo.get`` function downloads and instantiates a
-model based on its config file. The configuration file plays a key role
-in interaction with models in the Detectron2 project: it describes the
-model architecture as well as the training and validation processes.
-The ``detectron_zoo.get_config`` function can be used for finding and
-reading the model config.
-
-.. code:: ipython3
-
-    import detectron2.model_zoo as detectron_zoo
-
-
-    def get_model_and_config(model_name: str):
-        """
-        Helper function for downloading PyTorch model and its configuration from Detectron2 Model Zoo
-
-        Parameters:
-          model_name (str): model_id from Detectron2 Model Zoo
-        Returns:
-          model (torch.nn.Module): Pretrained model instance
-          cfg (Config): Configuration for model
-        """
-        cfg = detectron_zoo.get_config(model_name + ".yaml", trained=True)
-        model = detectron_zoo.get(model_name + ".yaml", trained=True)
-        return model, cfg
-
-The Detectron2 library is based on PyTorch. Starting from the 2023.0
-release, OpenVINO supports PyTorch model conversion directly via the Model
-Conversion API. The ``ov.convert_model`` function converts a PyTorch model
-to an OpenVINO Model object instance that is ready to be loaded on a device
-for inference, or can be saved on disk for later deployment using the
-``ov.save_model`` function.
-
-Detectron2 models use complex custom data structures internally, which
-brings some difficulties for exporting models to different formats and
-frameworks, including OpenVINO. To avoid these issues,
-``detectron2.export.TracingAdapter`` is provided as part of the Detectron2
-deployment API. ``TracingAdapter`` is a model wrapper class that
-simplifies the model’s structure, making it more export-friendly.
-
-.. 
code:: ipython3 - - from detectron2.modeling import GeneralizedRCNN - from detectron2.export import TracingAdapter - import torch - import openvino as ov - import warnings - from typing import List, Dict - - - def convert_detectron2_model(model: torch.nn.Module, sample_input: List[Dict[str, torch.Tensor]]): - """ - Function for converting Detectron2 models, creates TracingAdapter for making model tracing-friendly, - prepares inputs and converts model to OpenVINO Model - - Parameters: - model (torch.nn.Module): Model object for conversion - sample_input (List[Dict[str, torch.Tensor]]): sample input for tracing - Returns: - ov_model (ov.Model): OpenVINO Model - """ - # prepare input for tracing adapter - tracing_input = [{"image": sample_input[0]["image"]}] - - # override model forward and disable postprocessing if required - if isinstance(model, GeneralizedRCNN): - - def inference(model, inputs): - # use do_postprocess=False so it returns ROI mask - inst = model.inference(inputs, do_postprocess=False)[0] - return [{"instances": inst}] - - else: - inference = None # assume that we just call the model directly - - # create traceable model - traceable_model = TracingAdapter(model, tracing_input, inference) - warnings.filterwarnings("ignore") - # convert PyTorch model to OpenVINO model - ov_model = ov.convert_model(traceable_model, example_input=sample_input[0]["image"]) - return ov_model - -Prepare input data -~~~~~~~~~~~~~~~~~~ - - - -For running model conversion and inference we need to provide example -input. The cells below download sample image and apply preprocessing -steps based on model specific transformations defined in model config. - -.. code:: ipython3 - - import requests - from pathlib import Path - from PIL import Image - - MODEL_DIR = Path("model") - DATA_DIR = Path("data") - - MODEL_DIR.mkdir(exist_ok=True) - DATA_DIR.mkdir(exist_ok=True) - - input_image_url = "https://farm9.staticflickr.com/8040/8017130856_1b46b5f5fc_z.jpg" - - image_file = DATA_DIR / "example_image.jpg" - - if not image_file.exists(): - image = Image.open(requests.get(input_image_url, stream=True).raw) - image.save(image_file) - else: - image = Image.open(image_file) - - image - - - - -.. image:: detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_8_0.png - - - -.. code:: ipython3 - - import detectron2.data.transforms as T - from detectron2.data import detection_utils - import torch - - - def get_sample_inputs(image_path, cfg): - # get a sample data - original_image = detection_utils.read_image(image_path, format=cfg.INPUT.FORMAT) - # Do same preprocessing as DefaultPredictor - aug = T.ResizeShortestEdge([cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST) - height, width = original_image.shape[:2] - image = aug.get_transform(original_image).apply_image(original_image) - image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) - - inputs = {"image": image, "height": height, "width": width} - - # Sample ready - sample_inputs = [inputs] - return sample_inputs - -Now, when all components required for model conversion are prepared, we -can consider how to use them on specific examples. - -Object Detection ----------------- - - - -Download PyTorch Detection model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Download faster_rcnn_R_50_FPN_1x from Detectron Model Zoo. - -.. 
code:: ipython3
-
-    model_name = "COCO-Detection/faster_rcnn_R_50_FPN_1x"
-    model, cfg = get_model_and_config(model_name)
-    sample_input = get_sample_inputs(image_file, cfg)
-
-Convert Detection Model to OpenVINO Intermediate Representation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
-
-Convert the model using the ``convert_detectron2_model`` function and the
-``sample_input`` prepared above. After conversion, the model is saved on
-disk with the ``ov.save_model`` function and can be found in the ``model``
-directory.
-
-.. code:: ipython3
-
-    model_xml_path = MODEL_DIR / (model_name.split("/")[-1] + ".xml")
-    if not model_xml_path.exists():
-        ov_model = convert_detectron2_model(model, sample_input)
-        ov.save_model(ov_model, MODEL_DIR / (model_name.split("/")[-1] + ".xml"))
-    else:
-        ov_model = model_xml_path
-
-Select inference device
-~~~~~~~~~~~~~~~~~~~~~~~
-
-
-
-Select a device from the dropdown list for running inference using OpenVINO.
-
-.. code:: ipython3
-
-    from notebook_utils import device_widget
-
-    core = ov.Core()
-
-    device = device_widget()
-
-    device
-
-
-
-
-.. parsed-literal::
-
-    Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')
-
-
-
-Run Detection model inference
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
-
-Load the converted model on the selected device and run inference on the
-sample input.
-
-.. code:: ipython3
-
-    compiled_model = core.compile_model(ov_model, device.value)
-
-.. code:: ipython3
-
-    results = compiled_model(sample_input[0]["image"])
-
-The tracing adapter simplifies the model input and output format. After
-conversion, the model has multiple outputs in the following format:
-
-1. Predicted boxes: a floating-point tensor of shape [``N``, 4], where ``N``
-   is the number of detected boxes.
-2. Predicted classes: an integer tensor of shape [``N``], defining which
-   label each detected object belongs to. The value range of the predicted
-   classes tensor is [0, ``num_labels``), where ``num_labels`` is the number
-   of classes supported by the model (in our case 80).
-3. Predicted scores: a floating-point tensor of shape [``N``], giving the
-   confidence of each prediction.
-4. Input image size: an integer tensor with values [``H``, ``W``], where
-   ``H`` and ``W`` are the height and width of the input data, used for
-   rescaling predictions at the postprocessing step.
-
-For reusing the Detectron2 API for postprocessing and visualization, we
-provide helpers below for wrapping the output in the original Detectron2
-format.
-
-.. 
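Before wrapping the raw results back into Detectron2 structures with the helper cell that follows, it can be useful to inspect the output tensors directly. This is a small illustrative sketch, assuming the four-output detection layout described above (a fifth mask output appears only for segmentation models):

.. code:: ipython3

    # `results` is the OpenVINO inference output; entries can be indexed by position.
    boxes = results[0]       # (N, 4) box coordinates
    classes = results[1]     # (N,) predicted label ids
    scores = results[2]      # (N,) confidence per detection
    input_size = results[3]  # (H, W) network input size used for rescaling

    print(f"Detections: {len(boxes)}, best score: {scores.max():.3f}")
    print(f"Network input size: {int(input_size[0])}x{int(input_size[1])}")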
code:: ipython3 - - from detectron2.structures import Instances, Boxes - from detectron2.modeling.postprocessing import detector_postprocess - from detectron2.utils.visualizer import ColorMode, Visualizer - from detectron2.data import MetadataCatalog - import numpy as np - - - def postprocess_detection_result(outputs: Dict, orig_height: int, orig_width: int, conf_threshold: float = 0.0): - """ - Helper function for postprocessing prediction results - - Parameters: - outputs (Dict): OpenVINO model output dictionary - orig_height (int): original image height before preprocessing - orig_width (int): original image width before preprocessing - conf_threshold (float, optional, defaults 0.0): confidence threshold for valid prediction - Returns: - prediction_result (instances): postprocessed predicted instances - """ - boxes = outputs[0] - classes = outputs[1] - has_mask = len(outputs) >= 5 - masks = None if not has_mask else outputs[2] - scores = outputs[2 if not has_mask else 3] - model_input_size = ( - int(outputs[3 if not has_mask else 4][0]), - int(outputs[3 if not has_mask else 4][1]), - ) - filtered_detections = scores >= conf_threshold - boxes = Boxes(boxes[filtered_detections]) - scores = scores[filtered_detections] - classes = classes[filtered_detections] - out_dict = {"pred_boxes": boxes, "scores": scores, "pred_classes": classes} - if masks is not None: - masks = masks[filtered_detections] - out_dict["pred_masks"] = torch.from_numpy(masks) - instances = Instances(model_input_size, **out_dict) - return detector_postprocess(instances, orig_height, orig_width) - - - def draw_instance_prediction(img: np.ndarray, results: Instances, cfg: "Config"): - """ - Helper function for visualization prediction results - - Parameters: - img (np.ndarray): original image for drawing predictions - results (instances): model predictions - cfg (Config): model configuration - Returns: - img_with_res: image with results - """ - metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) - visualizer = Visualizer(img, metadata, instance_mode=ColorMode.IMAGE) - img_with_res = visualizer.draw_instance_predictions(results) - return img_with_res - -.. code:: ipython3 - - results = postprocess_detection_result(results, sample_input[0]["height"], sample_input[0]["width"], conf_threshold=0.05) - img_with_res = draw_instance_prediction(np.array(image), results, cfg) - Image.fromarray(img_with_res.get_image()) - - - - -.. image:: detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.png - - - -Instance Segmentation ---------------------- - - - -As it was discussed above, Detectron2 provides generic approach for -working with models for different use cases. The steps that required to -convert and run models pretrained for Instance Segmentation use case -will be very similar to Object Detection. - -Download Instance Segmentation PyTorch model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - model_name = "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x" - model, cfg = get_model_and_config(model_name) - sample_input = get_sample_inputs(image_file, cfg) - -Convert Instance Segmentation Model to OpenVINO Intermediate Representation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - model_xml_path = MODEL_DIR / (model_name.split("/")[-1] + ".xml") - - if not model_xml_path.exists(): - ov_model = convert_detectron2_model(model, sample_input) - ov.save_model(ov_model, MODEL_DIR / (model_name.split("/")[-1] + ".xml")) - else: - ov_model = model_xml_path - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Run Instance Segmentation model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -In comparison with Object Detection, Instance Segmentation models have -additional output that represents instance masks for each object. Our -postprocessing function handle this difference. - -.. code:: ipython3 - - compiled_model = core.compile_model(ov_model, device.value) - -.. code:: ipython3 - - results = compiled_model(sample_input[0]["image"]) - results = postprocess_detection_result(results, sample_input[0]["height"], sample_input[0]["width"], conf_threshold=0.05) - img_with_res = draw_instance_prediction(np.array(image), results, cfg) - Image.fromarray(img_with_res.get_image()) - - - - -.. image:: detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.png - - diff --git a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.jpg b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.jpg deleted file mode 100644 index b57bba091554c1..00000000000000 --- a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:22aba6ca94f8b35d4f5e31a5dbf3f709f6c2c5edc3fc70c2c9275ac748eca3f0 -size 59148 diff --git a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.png b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.png deleted file mode 100644 index 17d79e2fce5401..00000000000000 --- a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b85f6280877d4cd74d84c9c6ae8b280405e691ce718b192ac3b245fcd0fad3a8 -size 509097 diff --git a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.jpg b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.jpg deleted file mode 100644 index da8e188fd914cd..00000000000000 --- a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:94923b7a65515faa029473de128a2efdfda866de4c168baab0647853911c2758 -size 55624 diff --git a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.png b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.png deleted file mode 100644 index 85caea5d0083fd..00000000000000 --- a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9cf54566ec012d3e40b9b4adad5784f3a7b0b2162b260ed41de54970d22cd488 -size 
460819 diff --git a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_8_0.jpg b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_8_0.jpg deleted file mode 100644 index 3754e67394bee6..00000000000000 --- a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_8_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c6e3fc4cbbcd709eea9f7a8301b958e102846528f38e96d1c5f597bd7e396b3c -size 46858 diff --git a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_8_0.png b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_8_0.png deleted file mode 100644 index 939857cbf951af..00000000000000 --- a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_8_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fab03000d5af3fdd1a22993c61583d1ee07eeb16064eb03f458a9b5649a3dc27 -size 503218 diff --git a/docs/notebooks/distil-whisper-asr-with-output.rst b/docs/notebooks/distil-whisper-asr-with-output.rst deleted file mode 100644 index 0b1abaa6f23148..00000000000000 --- a/docs/notebooks/distil-whisper-asr-with-output.rst +++ /dev/null @@ -1,1112 +0,0 @@ -Automatic speech recognition using Distil-Whisper and OpenVINO -============================================================== - -`Distil-Whisper `__ -is a distilled variant of the -`Whisper `__ model by -OpenAI. The Distil-Whisper is proposed in the paper `Robust Knowledge -Distillation via Large-Scale Pseudo -Labelling `__. According to authors, -compared to Whisper, Distil-Whisper runs in several times faster with -50% fewer parameters, while performing to within 1% word error rate -(WER) on out-of-distribution evaluation data. - -Whisper is a Transformer based encoder-decoder model, also referred to -as a sequence-to-sequence model. It maps a sequence of audio spectrogram -features to a sequence of text tokens. First, the raw audio inputs are -converted to a log-Mel spectrogram by action of the feature extractor. -Then, the Transformer encoder encodes the spectrogram to form a sequence -of encoder hidden states. Finally, the decoder autoregressively predicts -text tokens, conditional on both the previous tokens and the encoder -hidden states. - -You can see the model architecture in the diagram below: - -.. figure:: https://user-images.githubusercontent.com/29454499/204536571-8f6d8d77-5fbd-4c6d-8e29-14e734837860.svg - :alt: whisper_architecture.svg - - whisper_architecture.svg - -In this tutorial, we consider how to run Distil-Whisper using OpenVINO. -We will use the pre-trained model from the `Hugging Face -Transformers `__ -library. To simplify the user experience, the `Hugging Face -Optimum `__ library is used to -convert the model to OpenVINO™ IR format. To further improve OpenVINO -Distil-Whisper model performance ``INT8`` post-training quantization -from `NNCF `__ is applied. 
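To make this data flow concrete before walking through the notebook steps, the plain Hugging Face Transformers pipeline (without OpenVINO, which is introduced later in this tutorial) can be sketched roughly as follows; the silent placeholder waveform is only there to keep the snippet self-contained:

.. code:: ipython3

    import numpy as np
    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

    model_id = "distil-whisper/distil-large-v2"
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)

    # Placeholder input: one second of silence at the 16 kHz rate Whisper expects.
    audio = np.zeros(16000, dtype=np.float32)

    # Feature extractor: raw waveform -> log-Mel spectrogram features.
    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features

    # Encoder encodes the spectrogram; decoder autoregressively predicts token ids.
    predicted_ids = model.generate(input_features)
    print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])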
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Load PyTorch model <#load-pytorch-model>`__ - - - `Prepare input sample <#prepare-input-sample>`__ - - `Run model inference <#run-model-inference>`__ - -- `Load OpenVINO model using Optimum - library <#load-openvino-model-using-optimum-library>`__ - - - `Select Inference device <#select-inference-device>`__ - - `Compile OpenVINO model <#compile-openvino-model>`__ - - `Run OpenVINO model inference <#run-openvino-model-inference>`__ - -- `Compare performance PyTorch vs - OpenVINO <#compare-performance-pytorch-vs-openvino>`__ -- `Usage OpenVINO model with HuggingFace - pipelines <#usage-openvino-model-with-huggingface-pipelines>`__ -- `Quantization <#quantization>`__ - - - `Prepare calibration datasets <#prepare-calibration-datasets>`__ - - `Quantize Distil-Whisper encoder and decoder - models <#quantize-distil-whisper-encoder-and-decoder-models>`__ - - `Run quantized model inference <#run-quantized-model-inference>`__ - - `Compare performance and accuracy of the original and quantized - models <#compare-performance-and-accuracy-of-the-original-and-quantized-models>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "transformers>=4.35" "torch>=2.4.1" "onnx!=1.16.2" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" - %pip install -q "openvino>=2023.2.0" datasets "gradio>=4.19" "librosa" "soundfile" - %pip install -q "nncf>=2.6.0" "jiwer" - - import requests - import platform - from pathlib import Path - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("distil-whisper-asr.ipynb") - -Load PyTorch model ------------------- - - - -The ``AutoModelForSpeechSeq2Seq.from_pretrained`` method is used for the -initialization of PyTorch Whisper model using the transformers library. -By default, we will use the ``distil-whisper/distil-large-v2`` model as -an example in this tutorial. The model will be downloaded once during -first run and this process may require some time. - -You may also choose other models from `Distil-Whisper hugging face -collection `__ -such as ``distil-whisper/distil-medium.en`` or -``distil-whisper/distil-small.en``. Models of the original Whisper -architecture are also available, more on them -`here `__. - -Preprocessing and post-processing are important in this model use. -``AutoProcessor`` class used for initialization ``WhisperProcessor`` is -responsible for preparing audio input data for the model, converting it -to Mel-spectrogram and decoding predicted output token_ids into string -using tokenizer. - -.. 
code:: ipython3 - - import ipywidgets as widgets - - model_ids = { - "Distil-Whisper": [ - "distil-whisper/distil-large-v2", - "distil-whisper/distil-large-v3", - "distil-whisper/distil-medium.en", - "distil-whisper/distil-small.en", - ], - "Whisper": [ - "openai/whisper-large-v3-turbo", - "openai/whisper-large-v3", - "openai/whisper-large-v2", - "openai/whisper-large", - "openai/whisper-medium", - "openai/whisper-small", - "openai/whisper-base", - "openai/whisper-tiny", - "openai/whisper-medium.en", - "openai/whisper-small.en", - "openai/whisper-base.en", - "openai/whisper-tiny.en", - ], - } - - model_type = widgets.Dropdown( - options=model_ids.keys(), - value="Distil-Whisper", - description="Model type:", - disabled=False, - ) - - model_type - -.. code:: ipython3 - - model_id = widgets.Dropdown( - options=model_ids[model_type.value], - value=model_ids[model_type.value][0], - description="Model:", - disabled=False, - ) - - model_id - -.. code:: ipython3 - - from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq - - processor = AutoProcessor.from_pretrained(model_id.value) - - pt_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id.value) - pt_model.eval(); - - -.. parsed-literal:: - - Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. - - -Prepare input sample -~~~~~~~~~~~~~~~~~~~~ - - - -The processor expects audio data in numpy array format and information -about the audio sampling rate and returns the ``input_features`` tensor -for making predictions. Conversion of audio to numpy format is handled -by Hugging Face datasets implementation. - -.. code:: ipython3 - - from datasets import load_dataset - - - def extract_input_features(sample): - input_features = processor( - sample["audio"]["array"], - sampling_rate=sample["audio"]["sampling_rate"], - return_tensors="pt", - ).input_features - return input_features - - - dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) - sample = dataset[0] - input_features = extract_input_features(sample) - -Run model inference -~~~~~~~~~~~~~~~~~~~ - - - -To perform speech recognition, one can use ``generate`` interface of the -model. After generation is finished processor.batch_decode can be used -for decoding predicted token_ids into text transcription. - -.. code:: ipython3 - - import IPython.display as ipd - - predicted_ids = pt_model.generate(input_features) - transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) - - display(ipd.Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])) - print(f"Reference: {sample['text']}") - print(f"Result: {transcription[0]}") - - - -.. raw:: html - - - - - - -.. parsed-literal:: - - Reference: MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL - Result: Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. - - -Load OpenVINO model using Optimum library ------------------------------------------ - - - -The Hugging Face Optimum API is a high-level API that enables us to -convert and quantize models from the Hugging Face Transformers library -to the OpenVINO™ IR format. For more details, refer to the `Hugging Face -Optimum -documentation `__. - -Optimum Intel can be used to load optimized models from the `Hugging -Face Hub `__ and -create pipelines to run an inference with OpenVINO Runtime using Hugging -Face APIs. 
The Optimum Inference models are API compatible with Hugging -Face Transformers models. This means we just need to replace the -``AutoModelForXxx`` class with the corresponding ``OVModelForXxx`` -class. - -Below is an example of the distil-whisper model - -.. code:: diff - - -from transformers import AutoModelForSpeechSeq2Seq - +from optimum.intel.openvino import OVModelForSpeechSeq2Seq - from transformers import AutoTokenizer, pipeline - - model_id = "distil-whisper/distil-large-v2" - -model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id) - +model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True) - -Model class initialization starts with calling the ``from_pretrained`` -method. When downloading and converting the Transformers model, the -parameter ``export=True`` should be added. We can save the converted -model for the next usage with the ``save_pretrained`` method. Tokenizers -and Processors are distributed with models also compatible with the -OpenVINO model. It means that we can reuse initialized early processor. - -.. code:: ipython3 - - from pathlib import Path - from optimum.intel.openvino import OVModelForSpeechSeq2Seq - - model_path = Path(model_id.value.replace("/", "_")) - ov_config = {"CACHE_DIR": ""} - - if not model_path.exists(): - ov_model = OVModelForSpeechSeq2Seq.from_pretrained( - model_id.value, - ov_config=ov_config, - export=True, - compile=False, - load_in_8bit=False, - ) - ov_model.half() - ov_model.save_pretrained(model_path) - else: - ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_path, ov_config=ov_config, compile=False) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino - - -Select Inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=4, options=('CPU', 'GPU.0', 'GPU.1', 'GPU.2', 'AUTO'), value='AUTO') - - - -Compile OpenVINO model -~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - ov_model.to(device.value) - ov_model.compile() - - -.. parsed-literal:: - - Compiling the encoder to AUTO ... - Compiling the decoder to AUTO ... - Compiling the decoder to AUTO ... - - -Run OpenVINO model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - predicted_ids = ov_model.generate(input_features) - transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) - - display(ipd.Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])) - print(f"Reference: {sample['text']}") - print(f"Result: {transcription[0]}") - - -.. parsed-literal:: - - /home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/optimum/intel/openvino/modeling_seq2seq.py:457: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly. - last_hidden_state = torch.from_numpy(self.request(inputs, shared_memory=True)["last_hidden_state"]).to( - /home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/optimum/intel/openvino/modeling_seq2seq.py:538: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly. - self.request.start_async(inputs, shared_memory=True) - - - -.. raw:: html - - - - - - -.. 
parsed-literal:: - - Reference: MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL - Result: Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. - - -Compare performance PyTorch vs OpenVINO ---------------------------------------- - - - -.. code:: ipython3 - - import time - import numpy as np - from tqdm.notebook import tqdm - - - def measure_perf(model, sample, n=10): - timers = [] - input_features = extract_input_features(sample) - for _ in tqdm(range(n), desc="Measuring performance"): - start = time.perf_counter() - model.generate(input_features) - end = time.perf_counter() - timers.append(end - start) - return np.median(timers) - -.. code:: ipython3 - - perf_torch = measure_perf(pt_model, sample) - perf_ov = measure_perf(ov_model, sample) - - - -.. parsed-literal:: - - Measuring performance: 0%| | 0/10 [00:00`__ -interface for ``automatic-speech-recognition``. Pipeline can be used for -long audio transcription. Distil-Whisper uses a chunked algorithm to -transcribe long-form audio files. In practice, this chunked long-form -algorithm is 9x faster than the sequential algorithm proposed by OpenAI -in the Whisper paper. To enable chunking, pass the chunk_length_s -parameter to the pipeline. For Distil-Whisper, a chunk length of 15 -seconds is optimal. To activate batching, pass the argument batch_size. - -.. code:: ipython3 - - from transformers import pipeline - import torch - - ov_model.generation_config = pt_model.generation_config - - pipe = pipeline( - "automatic-speech-recognition", - model=ov_model, - tokenizer=processor.tokenizer, - feature_extractor=processor.feature_extractor, - max_new_tokens=128, - chunk_length_s=15, - batch_size=16, - device=torch.device("cpu"), - ) - - -.. parsed-literal:: - - The model 'OVModelForWhisper' is not supported for automatic-speech-recognition. Supported models are ['Pop2PianoForConditionalGeneration', 'SeamlessM4TForSpeechToText', 'SpeechEncoderDecoderModel', 'Speech2TextForConditionalGeneration', 'SpeechT5ForSpeechToText', 'WhisperForConditionalGeneration', 'Data2VecAudioForCTC', 'HubertForCTC', 'MCTCTForCTC', 'SEWForCTC', 'SEWDForCTC', 'UniSpeechForCTC', 'UniSpeechSatForCTC', 'Wav2Vec2ForCTC', 'Wav2Vec2ConformerForCTC', 'WavLMForCTC']. - - -.. 
code:: ipython3 - - dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation", trust_remote_code=True) - sample_long = dataset[0] - - - def format_timestamp(seconds: float): - """ - format time in srt-file expected format - """ - assert seconds >= 0, "non-negative timestamp expected" - milliseconds = round(seconds * 1000.0) - - hours = milliseconds // 3_600_000 - milliseconds -= hours * 3_600_000 - - minutes = milliseconds // 60_000 - milliseconds -= minutes * 60_000 - - seconds = milliseconds // 1_000 - milliseconds -= seconds * 1_000 - - return (f"{hours}:" if hours > 0 else "00:") + f"{minutes:02d}:{seconds:02d},{milliseconds:03d}" - - - def prepare_srt(transcription): - """ - Format transcription into srt file format - """ - segment_lines = [] - for idx, segment in enumerate(transcription["chunks"]): - segment_lines.append(str(idx + 1) + "\n") - timestamps = segment["timestamp"] - time_start = format_timestamp(timestamps[0]) - time_end = format_timestamp(timestamps[1]) - time_str = f"{time_start} --> {time_end}\n" - segment_lines.append(time_str) - segment_lines.append(segment["text"] + "\n\n") - return segment_lines - -``return_timestamps`` argument allows getting timestamps of start and -end of speech associated with each processed chunk. It could be useful -in tasks like speech separation or generation of video subtitles. In -this example, we provide output formatting in SRT format, one of the -popular subtitles format. - -.. code:: ipython3 - - result = pipe(sample_long["audio"].copy(), return_timestamps=True) - -.. code:: ipython3 - - srt_lines = prepare_srt(result) - - display(ipd.Audio(sample_long["audio"]["array"], rate=sample_long["audio"]["sampling_rate"])) - print("".join(srt_lines)) - - - -.. raw:: html - - - - - - -.. parsed-literal:: - - 1 - 00:00:00,000 --> 00:00:06,560 - Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. - - 2 - 00:00:06,560 --> 00:00:11,280 - Nor is Mr. Quilter's manner less interesting than his matter. - - 3 - 00:00:11,280 --> 00:00:16,840 - He tells us that at this festive season of the year, with Christmas and roast beef looming - - 4 - 00:00:16,840 --> 00:00:23,760 - before us, similes drawn from eating and its results occur most readily to the mind. - - 5 - 00:00:23,760 --> 00:00:29,360 - He has grave doubts whether Sir Frederick Leighton's work is really Greek after all, and - - 6 - 00:00:29,360 --> 00:00:33,640 - can discover in it but little of Rocky Ithaca. - - 7 - 00:00:33,640 --> 00:00:39,760 - Lennel's pictures are a sort of upgards and Adam paintings, and Mason's exquisite - - 8 - 00:00:39,760 --> 00:00:44,720 - idles are as national as a jingo poem. - - 9 - 00:00:44,720 --> 00:00:50,320 - Mr. Burkett Foster's landscapes smile at one much in the same way that Mr. Carker used - - 10 - 00:00:50,320 --> 00:00:52,920 - to flash his teeth. - - 11 - 00:00:52,920 --> 00:00:58,680 - And Mr. John Collier gives his sitter a cheerful slap on the back, before he says, like - - 12 - 00:00:58,680 --> 00:01:01,120 - a shampooer and a Turkish bath, - - 13 - 00:01:01,120 --> 00:01:02,000 - Next man! - - - - -Quantization ------------- - - - -`NNCF `__ enables -post-training quantization by adding the quantization layers into the -model graph and then using a subset of the training dataset to -initialize the parameters of these additional quantization layers. The -framework is designed so that modifications to your original training -code are minor. 
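-
-The general ``nncf.quantize`` call pattern is sketched below. This is
-only a minimal illustration: the model path and the ``calibration_data``
-list are placeholders, and the actual calibration data collection and
-quantization of the encoder and decoder models are shown in the next
-sections.
-
-.. code:: ipython3
-
-    import nncf
-    import openvino as ov
-
-    core = ov.Core()
-    # Illustrative path; any OpenVINO IR model can be used here.
-    ov_model = core.read_model("openvino_encoder_model.xml")
-
-    # calibration_data is assumed to be a list of model inputs collected
-    # from real inference runs (see the calibration sections below).
-    calibration_dataset = nncf.Dataset(calibration_data)
-
-    quantized_model = nncf.quantize(
-        ov_model,
-        calibration_dataset,
-        model_type=nncf.ModelType.TRANSFORMER,
-    )
-    ov.save_model(quantized_model, "openvino_encoder_model_int8.xml")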
- -The optimization process contains the following steps: - -1. Create a calibration dataset for quantization. -2. Run ``nncf.quantize`` to obtain quantized encoder and decoder models. -3. Serialize the ``INT8`` model using ``openvino.save_model`` function. - -.. - - **Note**: Quantization is time and memory consuming operation. - Running quantization code below may take some time. - -Please select below whether you would like to run Distil-Whisper -quantization. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - import requests - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -Prepare calibration datasets -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -First step is to prepare calibration datasets for quantization. Since we -quantize whisper encoder and decoder separately, we need to prepare a -calibration dataset for each of the models. We import an -``InferRequestWrapper`` class that will intercept model inputs and -collect them to a list. Then we run model inference on some small amount -of audio samples. Generally, increasing the calibration dataset size -improves quantization quality. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from itertools import islice - from optimum.intel.openvino.quantization import InferRequestWrapper - - - def collect_calibration_dataset(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int): - # Overwrite model request properties, saving the original ones for restoring later - encoder_calibration_data = [] - decoder_calibration_data = [] - ov_model.encoder.request = InferRequestWrapper(ov_model.encoder.request, encoder_calibration_data, apply_caching=True) - ov_model.decoder.request = InferRequestWrapper(ov_model.decoder.request, - decoder_calibration_data, - apply_caching=True) - - try: - calibration_dataset = load_dataset("openslr/librispeech_asr", "clean", split="validation", streaming=True, trust_remote_code=True) - for sample in tqdm(islice(calibration_dataset, calibration_dataset_size), desc="Collecting calibration data", - total=calibration_dataset_size): - input_features = extract_input_features(sample) - ov_model.generate(input_features) - finally: - ov_model.encoder.request = ov_model.encoder.request.request - ov_model.decoder.request = ov_model.decoder.request.request - - return encoder_calibration_data, decoder_calibration_data - -Quantize Distil-Whisper encoder and decoder models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Below we run the ``quantize`` function which calls ``nncf.quantize`` on -Distil-Whisper encoder and decoder models. We don’t quantize -first-step-decoder because its share in whole inference time is -negligible. - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - import gc - import shutil - import nncf - import openvino as ov - - CALIBRATION_DATASET_SIZE = 50 - quantized_model_path = Path(f"{model_path}_quantized") - - - def quantize(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int): - if not quantized_model_path.exists(): - encoder_calibration_data, decoder_calibration_data = collect_calibration_dataset( - ov_model, calibration_dataset_size - ) - print("Quantizing encoder") - quantized_encoder = nncf.quantize( - ov_model.encoder.model, - nncf.Dataset(encoder_calibration_data), - subset_size=len(encoder_calibration_data), - model_type=nncf.ModelType.TRANSFORMER, - # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search - advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.50) - ) - ov.save_model(quantized_encoder, quantized_model_path / "openvino_encoder_model.xml") - del quantized_encoder - del encoder_calibration_data - gc.collect() - - print("Quantizing decoder with past") - quantized_decoder = nncf.quantize( - ov_model.decoder.model, - nncf.Dataset(decoder_calibration_data), - subset_size=len(decoder_calibration_data), - model_type=nncf.ModelType.TRANSFORMER, - # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search - advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.95) - ) - ov.save_model(quantized_decoder, quantized_model_path / "openvino_decoder_model.xml") - del quantized_decoder - del decoder_calibration_data - gc.collect() - - # Copy the config file and the first-step-decoder manually - shutil.copy(model_path / "config.json", quantized_model_path / "config.json") - - quantized_ov_model = OVModelForSpeechSeq2Seq.from_pretrained(quantized_model_path, ov_config=ov_config, compile=False) - quantized_ov_model.to(device.value) - quantized_ov_model.compile() - return quantized_ov_model - - - ov_quantized_model = quantize(ov_model, CALIBRATION_DATASET_SIZE) - - - -.. parsed-literal:: - - Collecting calibration data: 0%| | 0/10 [00:00 - - Your browser does not support the audio element. - - - - -.. parsed-literal:: - - Original : Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. - Quantized: Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. - - -Results are the same! - -Compare performance and accuracy of the original and quantized models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Finally, we compare original and quantized Distil-Whisper models from -accuracy and performance stand-points. - -To measure accuracy, we use ``1 - WER`` as a metric, where WER stands -for Word Error Rate. - -When measuring inference time, we do it separately for encoder and -decoder-with-past model forwards, and for the whole model inference too. - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - import time - from contextlib import contextmanager - from jiwer import wer, wer_standardize - - - TEST_DATASET_SIZE = 50 - MEASURE_TIME = False - - @contextmanager - def time_measurement(): - global MEASURE_TIME - try: - MEASURE_TIME = True - yield - finally: - MEASURE_TIME = False - - def time_fn(obj, fn_name, time_list): - original_fn = getattr(obj, fn_name) - - def wrapper(*args, **kwargs): - if not MEASURE_TIME: - return original_fn(\*args, \*\*kwargs) - start_time = time.perf_counter() - result = original_fn(\*args, \*\*kwargs) - end_time = time.perf_counter() - time_list.append(end_time - start_time) - return result - - setattr(obj, fn_name, wrapper) - - def calculate_transcription_time_and_accuracy(ov_model, test_samples): - encoder_infer_times = [] - decoder_with_past_infer_times = [] - whole_infer_times = [] - time_fn(ov_model, "generate", whole_infer_times) - time_fn(ov_model.encoder, "forward", encoder_infer_times) - time_fn(ov_model.decoder, "forward", decoder_with_past_infer_times) - - ground_truths = [] - predictions = [] - for data_item in tqdm(test_samples, desc="Measuring performance and accuracy"): - input_features = extract_input_features(data_item) - - with time_measurement(): - predicted_ids = ov_model.generate(input_features) - transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) - - ground_truths.append(data_item["text"]) - predictions.append(transcription[0]) - - word_accuracy = (1 - wer(ground_truths, predictions, reference_transform=wer_standardize, - hypothesis_transform=wer_standardize)) * 100 - mean_whole_infer_time = sum(whole_infer_times) - mean_encoder_infer_time = sum(encoder_infer_times) - mean_decoder_with_time_infer_time = sum(decoder_with_past_infer_times) - return word_accuracy, (mean_whole_infer_time, mean_encoder_infer_time, mean_decoder_with_time_infer_time) - - test_dataset = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True) - test_dataset = test_dataset.shuffle(seed=42).take(TEST_DATASET_SIZE) - test_samples = [sample for sample in test_dataset] - - accuracy_original, times_original = calculate_transcription_time_and_accuracy(ov_model, test_samples) - accuracy_quantized, times_quantized = calculate_transcription_time_and_accuracy(ov_quantized_model, test_samples) - print(f"Encoder performance speedup: {times_original[1] / times_quantized[1]:.3f}") - print(f"Decoder with past performance speedup: {times_original[2] / times_quantized[2]:.3f}") - print(f"Whole pipeline performance speedup: {times_original[0] / times_quantized[0]:.3f}") - print(f"Whisper transcription word accuracy. Original model: {accuracy_original:.2f}%. Quantized model: {accuracy_quantized:.2f}%.") - print(f"Accuracy drop: {accuracy_original - accuracy_quantized:.2f}%.") - - -.. parsed-literal:: - - Got disconnected from remote data host. Retrying in 5sec [1/20] - Got disconnected from remote data host. Retrying in 5sec [2/20] - - - -.. parsed-literal:: - - Measuring performance and accuracy: 0%| | 0/50 [00:00 MAX_AUDIO_MINS: - raise gr.Error( - f"To ensure fair usage of the Space, the maximum audio length permitted is {MAX_AUDIO_MINS} minutes." - f"Got an audio of length {round(audio_length_mins, 3)} minutes." 
- ) - - inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate} - - def _forward_ov_time(*args, **kwargs): - global ov_time - start_time = time.time() - result = pipe_forward(*args, **kwargs) - ov_time = time.time() - start_time - ov_time = round(ov_time, 2) - return result - - pipe._forward = _forward_ov_time - ov_text = pipe(inputs.copy(), batch_size=BATCH_SIZE)["text"] - return ov_text, ov_time - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/distil-whisper-asr/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=transcribe, quantized=to_quantize.value) - - try: - demo.launch(debug=False) - except Exception: - demo.launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/dynamicrafter-animating-images-with-output.rst b/docs/notebooks/dynamicrafter-animating-images-with-output.rst deleted file mode 100644 index df140e566277b5..00000000000000 --- a/docs/notebooks/dynamicrafter-animating-images-with-output.rst +++ /dev/null @@ -1,1558 +0,0 @@ -Animating Open-domain Images with DynamiCrafter and OpenVINO -============================================================ - -Animating a still image offers an engaging visual experience. -Traditional image animation techniques mainly focus on animating natural -scenes with stochastic dynamics (e.g. clouds and fluid) or -domain-specific motions (e.g. human hair or body motions), and thus -limits their applicability to more general visual content. To overcome -this limitation, `DynamiCrafter -team `__ explores the -synthesis of dynamic content for open-domain images, converting them -into animated videos. The key idea is to utilize the motion prior of -text-to-video diffusion models by incorporating the image into the -generative process as guidance. Given an image, DynamiCrafter team first -projects it into a text-aligned rich context representation space using -a query transformer, which facilitates the video model to digest the -image content in a compatible fashion. However, some visual details -still struggle to be preserved in the resultant videos. To supplement -with more precise image information, DynamiCrafter team further feeds -the full image to the diffusion model by concatenating it with the -initial noises. Experimental results show that the proposed method can -produce visually convincing and more logical & natural motions, as well -as higher conformity to the input image. - -In this tutorial, we consider how to use DynamiCrafter with OpenVINO. An -additional part demonstrates how to run optimization with -`NNCF `__ to speed up model. - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - -
-
-.. Example animations: “bear playing guitar happily, snowing” and
-   “boy walking on the street”.
-
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Load the original model <#load-the-original-model>`__ -- `Convert the model to OpenVINO - IR <#convert-the-model-to-openvino-ir>`__ - - - `Convert CLIP text encoder <#convert-clip-text-encoder>`__ - - `Convert CLIP image encoder <#convert-clip-image-encoder>`__ - - `Convert AE encoder <#convert-ae-encoder>`__ - - `Convert Diffusion U-Net model <#convert-diffusion-u-net-model>`__ - - `Convert AE decoder <#convert-ae-decoder>`__ - -- `Compiling models <#compiling-models>`__ -- `Building the pipeline <#building-the-pipeline>`__ -- `Run OpenVINO pipeline - inference <#run-openvino-pipeline-inference>`__ -- `Quantization <#quantization>`__ - - - `Prepare calibration dataset <#prepare-calibration-dataset>`__ - - `Run Quantization <#run-quantization>`__ - - `Run Weights Compression <#run-weights-compression>`__ - - `Compare model file sizes <#compare-model-file-sizes>`__ - - `Compare inference time of the FP32 and INT8 - pipelines <#compare-inference-time-of-the-fp32-and-int8-pipelines>`__ - -- `Interactive inference <#interactive-inference>`__ - - - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "openvino>=2024.2.0" "nncf>=2.11.0" "datasets>=2.20.0" - %pip install -q "gradio>=4.19" omegaconf einops pytorch_lightning kornia "open_clip_torch==2.22.0" transformers av opencv-python "torch==2.2.2" "av==12.0.0" --extra-index-url https://download.pytorch.org/whl/cpu - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - -.. code:: ipython3 - - from pathlib import Path - import requests - - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("dynamicrafter-animating-images.ipynb") - -.. code:: ipython3 - - from cmd_helper import clone_repo - - - clone_repo("https://github.com/Doubiiu/DynamiCrafter.git", "26e665cd6c174234238d2ded661e2e56f875d360") - - - - -.. parsed-literal:: - - PosixPath('DynamiCrafter') - - - -Load and run the original pipeline ----------------------------------- - - - -We will use model for 256x256 resolution as example. Also, models for -320x512 and 576x1024 are -`available `__. - -.. 
code:: ipython3 - - import os - from collections import OrderedDict - - import torch - from huggingface_hub import hf_hub_download - from omegaconf import OmegaConf - - from utils.utils import instantiate_from_config - - - def load_model_checkpoint(model, ckpt): - def load_checkpoint(model, ckpt, full_strict): - state_dict = torch.load(ckpt, map_location="cpu") - if "state_dict" in list(state_dict.keys()): - state_dict = state_dict["state_dict"] - try: - model.load_state_dict(state_dict, strict=full_strict) - except Exception: - ## rename the keys for 256x256 model - new_pl_sd = OrderedDict() - for k, v in state_dict.items(): - new_pl_sd[k] = v - - for k in list(new_pl_sd.keys()): - if "framestride_embed" in k: - new_key = k.replace("framestride_embed", "fps_embedding") - new_pl_sd[new_key] = new_pl_sd[k] - del new_pl_sd[k] - model.load_state_dict(new_pl_sd, strict=full_strict) - else: - ## deepspeed - new_pl_sd = OrderedDict() - for key in state_dict["module"].keys(): - new_pl_sd[key[16:]] = state_dict["module"][key] - model.load_state_dict(new_pl_sd, strict=full_strict) - - return model - - load_checkpoint(model, ckpt, full_strict=True) - print(">>> model checkpoint loaded.") - return model - - - def download_model(): - REPO_ID = "Doubiiu/DynamiCrafter" - if not os.path.exists("./checkpoints/dynamicrafter_256_v1/"): - os.makedirs("./checkpoints/dynamicrafter_256_v1/") - local_file = os.path.join("./checkpoints/dynamicrafter_256_v1/model.ckpt") - if not os.path.exists(local_file): - hf_hub_download(repo_id=REPO_ID, filename="model.ckpt", local_dir="./checkpoints/dynamicrafter_256_v1/", local_dir_use_symlinks=False) - - ckpt_path = "checkpoints/dynamicrafter_256_v1/model.ckpt" - config_file = "DynamiCrafter/configs/inference_256_v1.0.yaml" - config = OmegaConf.load(config_file) - model_config = config.pop("model", OmegaConf.create()) - model_config["params"]["unet_config"]["params"]["use_checkpoint"] = False - model = instantiate_from_config(model_config) - model = load_model_checkpoint(model, ckpt_path) - model.eval() - - return model - - - model = download_model() - - -.. parsed-literal:: - - Note: switching to '26e665cd6c174234238d2ded661e2e56f875d360'. - - You are in 'detached HEAD' state. You can look around, make experimental - changes and commit them, and you can discard any commits you make in this - state without impacting any branches by switching back to a branch. - - If you want to create a new branch to retain commits you create, you may - do so (now or later) by using -c with the switch command. Example: - - git switch -c - - Or undo this operation with: - - git switch - - - Turn off this advice by setting config variable advice.detachedHead to false - - HEAD is now at 26e665c add dataset - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/huggingface_hub/file_download.py:1204: UserWarning: `local_dir_use_symlinks` parameter is deprecated and will be ignored. The process to download files to a local folder has been updated and do not rely on symlinks anymore. You only need to pass a destination folder as`local_dir`. - For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder. - warnings.warn( - - - -.. parsed-literal:: - - model.ckpt: 0%| | 0.00/10.4G [00:00>> model checkpoint loaded. 
- - -Convert the model to OpenVINO IR --------------------------------- - - - -Let’s define the conversion function for PyTorch modules. We use -``ov.convert_model`` function to obtain OpenVINO Intermediate -Representation object and ``ov.save_model`` function to save it as XML -file. - -.. code:: ipython3 - - import gc - - import openvino as ov - - - def convert(model: torch.nn.Module, xml_path: str, example_input, input_shape=None): - xml_path = Path(xml_path) - if not xml_path.exists(): - xml_path.parent.mkdir(parents=True, exist_ok=True) - with torch.no_grad(): - if not input_shape: - converted_model = ov.convert_model(model, example_input=example_input) - else: - converted_model = ov.convert_model(model, example_input=example_input, input=input_shape) - ov.save_model(converted_model, xml_path, compress_to_fp16=False) - - # cleanup memory - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - -Flowchart of DynamiCrafter proposed in `the -paper `__: - -|schema| Description: > During inference, our model can generate -animation clips from noise conditioned on the input still image. - -Let’s convert models from the pipeline one by one. - -.. |schema| image:: https://github.com/openvinotoolkit/openvino_notebooks/assets/76171391/d1033876-c664-4345-a254-0649edbf1906 - -Convert CLIP text encoder -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from lvdm.modules.encoders.condition import FrozenOpenCLIPEmbedder - - MODEL_DIR = Path("models") - - COND_STAGE_MODEL_OV_PATH = MODEL_DIR / "cond_stage_model.xml" - - - class FrozenOpenCLIPEmbedderWrapper(FrozenOpenCLIPEmbedder): - def forward(self, tokens): - z = self.encode_with_transformer(tokens.to(self.device)) - return z - - - cond_stage_model = FrozenOpenCLIPEmbedderWrapper(device="cpu") - - if not COND_STAGE_MODEL_OV_PATH.exists(): - convert( - cond_stage_model, - COND_STAGE_MODEL_OV_PATH, - example_input=torch.ones([1, 77], dtype=torch.long), - ) - - del cond_stage_model - gc.collect(); - - -.. parsed-literal:: - - WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. - - -.. parsed-literal:: - - WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. - - -Convert CLIP image encoder -~~~~~~~~~~~~~~~~~~~~~~~~~~ - - -``FrozenOpenCLIPImageEmbedderV2`` model accepts images of various -resolutions. - -.. code:: ipython3 - - EMBEDDER_OV_PATH = MODEL_DIR / "embedder_ir.xml" - - - dummy_input = torch.rand([1, 3, 767, 767], dtype=torch.float32) - - model.embedder.model.visual.input_patchnorm = None # fix error: visual model has not attribute 'input_patchnorm' - if not EMBEDDER_OV_PATH.exists(): - convert(model.embedder, EMBEDDER_OV_PATH, example_input=dummy_input, input_shape=[1, 3, -1, -1]) - - - del model.embedder - gc.collect(); - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/utils/image.py:226: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs! - if input.numel() == 0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:573: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if size == input_size: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:579: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - antialias = antialias and (max(factors) > 1) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:581: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if antialias: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:584: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - sigmas = (max((factors[0] - 1.0) / 2.0, 0.001), max((factors[1] - 1.0) / 2.0, 0.001)) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:589: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3)) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:589: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3)) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/filters/gaussian.py:55: TracerWarning: torch.tensor results are registered as constants in the trace. 
You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. - sigma = tensor([sigma], device=input.device, dtype=input.dtype) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/filters/gaussian.py:55: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - sigma = tensor([sigma], device=input.device, dtype=input.dtype) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/core/check.py:78: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if x_shape_to_check[i] != dim: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/filters/kernels.py:92: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. - mean = tensor([[mean]], device=sigma.device, dtype=sigma.dtype) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:101: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if len(mean.shape) == 0 or mean.shape[0] == 1: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:103: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if len(std.shape) == 0 or std.shape[0] == 1: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:107: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
- if mean.shape and mean.shape[0] != 1: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:108: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if mean.shape[0] != data.shape[1] and mean.shape[:2] != data.shape[:2]: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:112: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if std.shape and std.shape[0] != 1: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:113: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if std.shape[0] != data.shape[1] and std.shape[:2] != data.shape[:2]: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:116: TracerWarning: torch.as_tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. - mean = torch.as_tensor(mean, device=data.device, dtype=data.dtype) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:117: TracerWarning: torch.as_tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. - std = torch.as_tensor(std, device=data.device, dtype=data.dtype) - - -Convert AE encoder -~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - ENCODER_FIRST_STAGE_OV_PATH = MODEL_DIR / "encoder_first_stage_ir.xml" - - - dummy_input = torch.rand([1, 3, 256, 256], dtype=torch.float32) - - if not ENCODER_FIRST_STAGE_OV_PATH.exists(): - convert( - model.first_stage_model.encoder, - ENCODER_FIRST_STAGE_OV_PATH, - example_input=dummy_input, - ) - - del model.first_stage_model.encoder - gc.collect(); - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/ae_modules.py:67: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - w_ = w_ * (int(c)**(-0.5)) - - -Convert Diffusion U-Net model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - MODEL_OV_PATH = MODEL_DIR / "model_ir.xml" - - - class ModelWrapper(torch.nn.Module): - def __init__(self, diffusion_model): - super().__init__() - self.diffusion_model = diffusion_model - - def forward(self, xc, t, context=None, fs=None, temporal_length=None): - outputs = self.diffusion_model(xc, t, context=context, fs=fs, temporal_length=temporal_length) - return outputs - - - if not MODEL_OV_PATH.exists(): - convert( - ModelWrapper(model.model.diffusion_model), - MODEL_OV_PATH, - example_input={ - "xc": torch.rand([1, 8, 16, 32, 32], dtype=torch.float32), - "t": torch.tensor([1]), - "context": torch.rand([1, 333, 1024], dtype=torch.float32), - "fs": torch.tensor([3]), - "temporal_length": torch.tensor([16]), - }, - ) - - out_channels = model.model.diffusion_model.out_channels - del model.model.diffusion_model - gc.collect(); - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/openaimodel3d.py:556: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if l_context == 77 + t*16: ## !!! HARD CODE here - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/openaimodel3d.py:205: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if batch_size: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/openaimodel3d.py:232: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if self.use_temporal_conv and batch_size: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/openaimodel3d.py:76: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert x.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/openaimodel3d.py:99: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert x.shape[1] == self.channels - - -Convert AE decoder -~~~~~~~~~~~~~~~~~~ - -``Decoder`` receives a -``bfloat16`` tensor. numpy doesn’t support this type. To avoid problems -with the conversion lets replace ``decode`` method to convert bfloat16 -to float32. - -.. code:: ipython3 - - import types - - - def decode(self, z, **kwargs): - z = self.post_quant_conv(z) - z = z.float() - dec = self.decoder(z) - return dec - - - model.first_stage_model.decode = types.MethodType(decode, model.first_stage_model) - -.. code:: ipython3 - - DECODER_FIRST_STAGE_OV_PATH = MODEL_DIR / "decoder_first_stage_ir.xml" - - - dummy_input = torch.rand([16, 4, 32, 32], dtype=torch.float32) - - if not DECODER_FIRST_STAGE_OV_PATH.exists(): - convert( - model.first_stage_model.decoder, - DECODER_FIRST_STAGE_OV_PATH, - example_input=dummy_input, - ) - - del model.first_stage_model.decoder - gc.collect(); - -Compiling models ----------------- - - - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - from notebook_utils import device_widget - - core = ov.Core() - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - cond_stage_model = core.read_model(COND_STAGE_MODEL_OV_PATH) - encoder_first_stage = core.read_model(ENCODER_FIRST_STAGE_OV_PATH) - embedder = core.read_model(EMBEDDER_OV_PATH) - model_ov = core.read_model(MODEL_OV_PATH) - decoder_first_stage = core.read_model(DECODER_FIRST_STAGE_OV_PATH) - - compiled_cond_stage_model = core.compile_model(cond_stage_model, device.value) - compiled_encode_first_stage = core.compile_model(encoder_first_stage, device.value) - compiled_embedder = core.compile_model(embedder, device.value) - compiled_model = core.compile_model(model_ov, device.value) - compiled_decoder_first_stage = core.compile_model(decoder_first_stage, device.value) - -Building the pipeline ---------------------- - - - -Let’s create callable wrapper classes for compiled models to allow -interaction with original pipelines. Note that all of wrapper classes -return ``torch.Tensor``\ s instead of ``np.array``\ s. - -.. 
code:: ipython3 - - from typing import Any - import open_clip - - - class CondStageModelWrapper(torch.nn.Module): - def __init__(self, cond_stage_model): - super().__init__() - self.cond_stage_model = cond_stage_model - - def encode(self, tokens): - if isinstance(tokens, list): - tokens = open_clip.tokenize(tokens[0]) - outs = self.cond_stage_model(tokens)[0] - - return torch.from_numpy(outs) - - - class EncoderFirstStageModelWrapper(torch.nn.Module): - def __init__(self, encode_first_stage): - super().__init__() - self.encode_first_stage = encode_first_stage - - def forward(self, x): - outs = self.encode_first_stage(x)[0] - - return torch.from_numpy(outs) - - def __call__(self, *args: Any, **kwargs: Any) -> Any: - return self.forward(*args, **kwargs) - - - class EmbedderWrapper(torch.nn.Module): - def __init__(self, embedder): - super().__init__() - self.embedder = embedder - - def forward(self, x): - outs = self.embedder(x)[0] - - return torch.from_numpy(outs) - - def __call__(self, *args: Any, **kwargs: Any) -> Any: - return self.forward(*args, **kwargs) - - - class CModelWrapper(torch.nn.Module): - def __init__(self, diffusion_model, out_channels): - super().__init__() - self.diffusion_model = diffusion_model - self.out_channels = out_channels - - def forward(self, xc, t, context, fs, temporal_length): - inputs = { - "xc": xc, - "t": t, - "context": context, - "fs": fs, - } - outs = self.diffusion_model(inputs)[0] - - return torch.from_numpy(outs) - - def __call__(self, *args: Any, **kwargs: Any) -> Any: - return self.forward(*args, **kwargs) - - - class DecoderFirstStageModelWrapper(torch.nn.Module): - def __init__(self, decoder_first_stage): - super().__init__() - self.decoder_first_stage = decoder_first_stage - - def forward(self, x): - x.float() - outs = self.decoder_first_stage(x)[0] - - return torch.from_numpy(outs) - - def __call__(self, *args: Any, **kwargs: Any) -> Any: - return self.forward(*args, **kwargs) - -And insert wrappers instances in the pipeline: - -.. code:: ipython3 - - model.cond_stage_model = CondStageModelWrapper(compiled_cond_stage_model) - model.first_stage_model.encoder = EncoderFirstStageModelWrapper(compiled_encode_first_stage) - model.embedder = EmbedderWrapper(compiled_embedder) - model.model.diffusion_model = CModelWrapper(compiled_model, out_channels) - model.first_stage_model.decoder = DecoderFirstStageModelWrapper(compiled_decoder_first_stage) - -Run OpenVINO pipeline inference -------------------------------- - - - -.. 
code:: ipython3 - - from einops import repeat, rearrange - import torchvision.transforms as transforms - - - transform = transforms.Compose( - [ - transforms.Resize(min((256, 256))), - transforms.CenterCrop((256, 256)), - ] - ) - - - def get_latent_z(model, videos): - b, c, t, h, w = videos.shape - x = rearrange(videos, "b c t h w -> (b t) c h w") - z = model.encode_first_stage(x) - z = rearrange(z, "(b t) c h w -> b c t h w", b=b, t=t) - return z - - - def process_input(model, prompt, image, transform=transform, fs=3): - text_emb = model.get_learned_conditioning([prompt]) - - # img cond - img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device) - img_tensor = (img_tensor / 255.0 - 0.5) * 2 - - image_tensor_resized = transform(img_tensor) # 3,h,w - videos = image_tensor_resized.unsqueeze(0) # bchw - - z = get_latent_z(model, videos.unsqueeze(2)) # bc,1,hw - frames = model.temporal_length - img_tensor_repeat = repeat(z, "b c t h w -> b c (repeat t) h w", repeat=frames) - - cond_images = model.embedder(img_tensor.unsqueeze(0)) # blc - img_emb = model.image_proj_model(cond_images) - imtext_cond = torch.cat([text_emb, img_emb], dim=1) - - fs = torch.tensor([fs], dtype=torch.long, device=model.device) - cond = {"c_crossattn": [imtext_cond], "fs": fs, "c_concat": [img_tensor_repeat]} - return cond - -.. code:: ipython3 - - import time - from PIL import Image - import numpy as np - from lvdm.models.samplers.ddim import DDIMSampler - from pytorch_lightning import seed_everything - import torchvision - - - def register_buffer(self, name, attr): - if isinstance(attr, torch.Tensor): - if attr.device != torch.device("cpu"): - attr = attr.to(torch.device("cpu")) - setattr(self, name, attr) - - - def batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=50, ddim_eta=1.0, cfg_scale=1.0, temporal_cfg_scale=None, **kwargs): - ddim_sampler = DDIMSampler(model) - uncond_type = model.uncond_type - batch_size = noise_shape[0] - fs = cond["fs"] - del cond["fs"] - if noise_shape[-1] == 32: - timestep_spacing = "uniform" - guidance_rescale = 0.0 - else: - timestep_spacing = "uniform_trailing" - guidance_rescale = 0.7 - # construct unconditional guidance - if cfg_scale != 1.0: - if uncond_type == "empty_seq": - prompts = batch_size * [""] - # prompts = N * T * [""] ## if is_imgbatch=True - uc_emb = model.get_learned_conditioning(prompts) - elif uncond_type == "zero_embed": - c_emb = cond["c_crossattn"][0] if isinstance(cond, dict) else cond - uc_emb = torch.zeros_like(c_emb) - - # process image embedding token - if hasattr(model, "embedder"): - uc_img = torch.zeros(noise_shape[0], 3, 224, 224).to(model.device) - ## img: b c h w >> b l c - uc_img = model.embedder(uc_img) - uc_img = model.image_proj_model(uc_img) - uc_emb = torch.cat([uc_emb, uc_img], dim=1) - - if isinstance(cond, dict): - uc = {key: cond[key] for key in cond.keys()} - uc.update({"c_crossattn": [uc_emb]}) - else: - uc = uc_emb - else: - uc = None - - x_T = None - batch_variants = [] - - for _ in range(n_samples): - if ddim_sampler is not None: - kwargs.update({"clean_cond": True}) - samples, _ = ddim_sampler.sample( - S=ddim_steps, - conditioning=cond, - batch_size=noise_shape[0], - shape=noise_shape[1:], - verbose=False, - unconditional_guidance_scale=cfg_scale, - unconditional_conditioning=uc, - eta=ddim_eta, - temporal_length=noise_shape[2], - conditional_guidance_scale_temporal=temporal_cfg_scale, - x_T=x_T, - fs=fs, - timestep_spacing=timestep_spacing, - guidance_rescale=guidance_rescale, - **kwargs, - ) - 
# reconstruct from latent to pixel space - batch_images = model.decode_first_stage(samples) - batch_variants.append(batch_images) - # batch, , c, t, h, w - batch_variants = torch.stack(batch_variants, dim=1) - return batch_variants - - - # monkey patching to replace the original method 'register_buffer' that uses CUDA - DDIMSampler.register_buffer = types.MethodType(register_buffer, DDIMSampler) - - - def save_videos(batch_tensors, savedir, filenames, fps=10): - # b,samples,c,t,h,w - n_samples = batch_tensors.shape[1] - for idx, vid_tensor in enumerate(batch_tensors): - video = vid_tensor.detach().cpu() - video = torch.clamp(video.float(), -1.0, 1.0) - video = video.permute(2, 0, 1, 3, 4) # t,n,c,h,w - frame_grids = [torchvision.utils.make_grid(framesheet, nrow=int(n_samples)) for framesheet in video] # [3, 1*h, n*w] - grid = torch.stack(frame_grids, dim=0) # stack in temporal dim [t, 3, n*h, w] - grid = (grid + 1.0) / 2.0 - grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1) - savepath = os.path.join(savedir, f"{filenames[idx]}.mp4") - torchvision.io.write_video(savepath, grid, fps=fps, video_codec="h264", options={"crf": "10"}) - - - def get_image(image, prompt, steps=5, cfg_scale=7.5, eta=1.0, fs=3, seed=123, model=model, result_dir="results"): - if not os.path.exists(result_dir): - os.mkdir(result_dir) - - seed_everything(seed) - - # torch.cuda.empty_cache() - print("start:", prompt, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))) - start = time.time() - if steps > 60: - steps = 60 - model = model.cpu() - batch_size = 1 - channels = model.model.diffusion_model.out_channels - frames = model.temporal_length - h, w = 256 // 8, 256 // 8 - noise_shape = [batch_size, channels, frames, h, w] - - # text cond - with torch.no_grad(), torch.cpu.amp.autocast(): - cond = process_input(model, prompt, image, transform, fs=3) - - ## inference - batch_samples = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale) - ## b,samples,c,t,h,w - prompt_str = prompt.replace("/", "_slash_") if "/" in prompt else prompt - prompt_str = prompt_str.replace(" ", "_") if " " in prompt else prompt_str - prompt_str = prompt_str[:40] - if len(prompt_str) == 0: - prompt_str = "empty_prompt" - - save_videos(batch_samples, result_dir, filenames=[prompt_str], fps=8) - print(f"Saved in {prompt_str}.mp4. Time used: {(time.time() - start):.2f} seconds") - - return os.path.join(result_dir, f"{prompt_str}.mp4") - -.. code:: ipython3 - - image_path = "DynamiCrafter/prompts/256/art.png" - prompt = "man fishing in a boat at sunset" - seed = 234 - image = Image.open(image_path) - image = np.asarray(image) - result_dir = "results" - video_path = get_image(image, prompt, steps=20, seed=seed, model=model, result_dir=result_dir) - - -.. parsed-literal:: - - Seed set to 234 - /tmp/ipykernel_3297302/2451984876.py:25: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.) - img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device) - - -.. parsed-literal:: - - start: man fishing in a boat at sunset 2025-02-04 00:28:55 - Saved in man_fishing_in_a_boat_at_sunset.mp4. 
Time used: 201.18 seconds - - -.. code:: ipython3 - - from IPython.display import HTML - - HTML( - f""" - - """ - ) - - - - -.. raw:: html - - - - - - - -Quantization ------------- - - - -`NNCF `__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -According to ``DynamiCrafter`` structure, denoising UNet model is used -in the cycle repeating inference on each diffusion step, while other -parts of pipeline take part only once. Now we will show you how to -optimize pipeline using -`NNCF `__ to reduce memory and -computation cost. - -Please select below whether you would like to run quantization to -improve model inference speed. - - **NOTE**: Quantization is time and memory consuming operation. - Running quantization code below may take some time. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - import requests - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - int8_model = None - MODEL_INT8_OV_PATH = MODEL_DIR / "model_ir_int8.xml" - COND_STAGE_MODEL_INT8_OV_PATH = MODEL_DIR / "cond_stage_model_int8.xml" - DECODER_FIRST_STAGE_INT8_OV_PATH = MODEL_DIR / "decoder_first_stage_ir_int8.xml" - ENCODER_FIRST_STAGE_INT8_OV_PATH = MODEL_DIR / "encoder_first_stage_ir_int8.xml" - EMBEDDER_INT8_OV_PATH = MODEL_DIR / "embedder_ir_int8.xml" - - %load_ext skip_kernel_extension - -Prepare calibration dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use a portion of -`jovianzm/Pexels-400k `__ -dataset from Hugging Face as calibration data. - -.. code:: ipython3 - - from io import BytesIO - - - def download_image(url): - try: - response = requests.get(url) - response.raise_for_status() - img = Image.open(BytesIO(response.content)) - # Convert the image to a NumPy array - img_array = np.array(img) - return img_array - except Exception as err: - print(f"Error occurred: {err}") - return None - -To collect intermediate model inputs for calibration we should customize -``CompiledModel``. - -.. 
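The inputs cached by the wrapper defined in the next cell are what ``nncf.quantize``
consumes. A minimal sketch of that later step, assuming the FP32 UNet IR saved earlier
as ``MODEL_OV_PATH`` and the ``calibration_data`` list collected below; the quantization
arguments are illustrative, not necessarily the exact settings used to produce the
results in this notebook:

.. code:: ipython3

    %%skip not $to_quantize.value

    import nncf
    import openvino as ov

    core = ov.Core()

    if not MODEL_INT8_OV_PATH.exists():
        # quantize the denoising UNet, the part of the pipeline that runs on every diffusion step
        quantized_unet = nncf.quantize(
            model=core.read_model(MODEL_OV_PATH),
            calibration_dataset=nncf.Dataset(calibration_data),
            model_type=nncf.ModelType.TRANSFORMER,
            subset_size=300,
        )
        ov.save_model(quantized_unet, MODEL_INT8_OV_PATH)

..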
code:: ipython3 - - %%skip not $to_quantize.value - - import datasets - from tqdm.notebook import tqdm - import pickle - - - class CompiledModelDecorator(ov.CompiledModel): - def __init__(self, compiled_model, keep_prob, data_cache = None): - super().__init__(compiled_model) - self.data_cache = data_cache if data_cache else [] - self.keep_prob = np.clip(keep_prob, 0, 1) - - def __call__(self, *args, **kwargs): - if np.random.rand() <= self.keep_prob: - self.data_cache.append(*args) - return super().__call__(*args, **kwargs) - - def collect_calibration_data(model, subset_size): - calibration_dataset_filepath = Path("calibration_data")/f"{subset_size}.pkl" - calibration_dataset_filepath.parent.mkdir(exist_ok=True, parents=True) - if not calibration_dataset_filepath.exists(): - original_diffusion_model = model.model.diffusion_model.diffusion_model - modified_model = CompiledModelDecorator(original_diffusion_model, keep_prob=1) - model.model.diffusion_model = CModelWrapper(modified_model, model.model.diffusion_model.out_channels) - - dataset = datasets.load_dataset("google-research-datasets/conceptual_captions", trust_remote_code=True, split="train", streaming=True).shuffle(seed=42).take(subset_size) - - pbar = tqdm(total=subset_size) - channels = model.model.diffusion_model.out_channels - frames = model.temporal_length - h, w = 256 // 8, 256 // 8 - noise_shape = [1, channels, frames, h, w] - for batch in dataset: - prompt = batch["caption"] - image_path = batch["image_url"] - image = download_image(image_path) - if image is None: - continue - - cond = process_input(model, prompt, image) - batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=20, ddim_eta=1.0, cfg_scale=7.5) - - collected_subset_size = len(model.model.diffusion_model.diffusion_model.data_cache) - if collected_subset_size >= subset_size: - pbar.update(subset_size - pbar.n) - break - pbar.update(collected_subset_size - pbar.n) - - calibration_dataset = model.model.diffusion_model.diffusion_model.data_cache[:subset_size] - model.model.diffusion_model.diffusion_model = original_diffusion_model - with open(calibration_dataset_filepath, 'wb') as f: - pickle.dump(calibration_dataset, f) - with open(calibration_dataset_filepath, 'rb') as f: - calibration_dataset = pickle.load(f) - return calibration_dataset - -.. code:: ipython3 - - %%skip not $to_quantize.value - - - if not MODEL_INT8_OV_PATH.exists(): - subset_size = 300 - calibration_data = collect_calibration_data(model, subset_size=subset_size) - - - -.. parsed-literal:: - - 0%| | 0/300 [00:00>> model checkpoint loaded. - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - image_path = "DynamiCrafter/prompts/256/art.png" - prompt = "man fishing in a boat at sunset" - seed = 234 - image = Image.open(image_path) - image = np.asarray(image) - - result_dir = "results_int8" - video_path = get_image(image, prompt, steps=20, seed=seed, model=int8_model, result_dir=result_dir) - - -.. parsed-literal:: - - Seed set to 234 - - -.. parsed-literal:: - - start: man fishing in a boat at sunset 2025-02-04 01:58:47 - Saved in man_fishing_in_a_boat_at_sunset.mp4. Time used: 105.65 seconds - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from IPython.display import display, HTML - - display(HTML(f""" - - """)) - - - -.. raw:: html - - - - - - -Compare model file sizes -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - fp32_model_paths = [COND_STAGE_MODEL_OV_PATH, DECODER_FIRST_STAGE_OV_PATH, ENCODER_FIRST_STAGE_OV_PATH, EMBEDDER_OV_PATH, MODEL_OV_PATH] - int8_model_paths = [COND_STAGE_MODEL_INT8_OV_PATH, DECODER_FIRST_STAGE_INT8_OV_PATH, ENCODER_FIRST_STAGE_INT8_OV_PATH, EMBEDDER_INT8_OV_PATH, MODEL_INT8_OV_PATH] - - for fp16_path, int8_path in zip(fp32_model_paths, int8_model_paths): - fp32_ir_model_size = fp16_path.with_suffix(".bin").stat().st_size - int8_model_size = int8_path.with_suffix(".bin").stat().st_size - print(f"{fp16_path.stem} compression rate: {fp32_ir_model_size / int8_model_size:.3f}") - - -.. parsed-literal:: - - cond_stage_model compression rate: 3.977 - decoder_first_stage_ir compression rate: 3.987 - encoder_first_stage_ir compression rate: 3.986 - embedder_ir compression rate: 3.977 - model_ir compression rate: 3.981 - - -Compare inference time of the FP32 and INT8 models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To measure the inference performance of the ``FP32`` and ``INT8`` -models, we use median inference time on calibration subset. - - **NOTE**: For the most accurate performance estimation, it is - recommended to run ``benchmark_app`` in a terminal/command prompt - after closing other applications. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import time - - - def calculate_inference_time(model, validation_size=3): - calibration_dataset = datasets.load_dataset("jovianzm/Pexels-400k", split="train", streaming=True).take(validation_size) - inference_time = [] - channels = model.model.diffusion_model.out_channels - frames = model.temporal_length - h, w = 256 // 8, 256 // 8 - noise_shape = [1, channels, frames, h, w] - for batch in calibration_dataset: - prompt = batch["title"] - image_path = batch["thumbnail"] - image = download_image(image_path) - cond = process_input(model, prompt, image, transform, fs=3) - - start = time.perf_counter() - _ = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=20, ddim_eta=1.0, cfg_scale=7.5) - end = time.perf_counter() - delta = end - start - inference_time.append(delta) - return np.median(inference_time) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - fp_latency = calculate_inference_time(model) - print(f"FP32 latency: {fp_latency:.3f}") - int8_latency = calculate_inference_time(int8_model) - print(f"INT8 latency: {int8_latency:.3f}") - print(f"Performance speed up: {fp_latency / int8_latency:.3f}") - - -.. parsed-literal:: - - FP32 latency: 192.408 - INT8 latency: 97.997 - Performance speed up: 1.963 - - -Interactive inference ---------------------- - - - -Please select below whether you would like to use the quantized models -to launch the interactive demo. - -.. code:: ipython3 - - from ipywidgets import widgets - - quantized_models_present = int8_model is not None - - use_quantized_models = widgets.Checkbox( - value=quantized_models_present, - description="Use quantized models", - disabled=not quantized_models_present, - ) - - use_quantized_models - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Use quantized models') - - - -.. 
code:: ipython3 - - from functools import partial - - demo_model = int8_model if use_quantized_models.value else model - get_image_fn = partial(get_image, model=demo_model) - - if not Path("gradio_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/dynamicrafter-animating-images/gradio_helper.py" - ) - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=get_image_fn) - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(debug=False, share=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - - -.. parsed-literal:: - - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. - - - - - - - diff --git a/docs/notebooks/efficient-sam-with-output.rst b/docs/notebooks/efficient-sam-with-output.rst deleted file mode 100644 index fd2a9fb5a6154a..00000000000000 --- a/docs/notebooks/efficient-sam-with-output.rst +++ /dev/null @@ -1,1343 +0,0 @@ -Object segmentations with EfficientSAM and OpenVINO -=================================================== - -`Segment Anything Model (SAM) `__ has -emerged as a powerful tool for numerous vision applications. A key -component that drives the impressive performance for zero-shot transfer -and high versatility is a super large Transformer model trained on the -extensive high-quality SA-1B dataset. While beneficial, the huge -computation cost of SAM model has limited its applications to wider -real-world applications. To address this limitation, EfficientSAMs, -light-weight SAM models that exhibit decent performance with largely -reduced complexity, were proposed. The idea behind EfficientSAM is based -on leveraging masked image pretraining, SAMI, which learns to -reconstruct features from SAM image encoder for effective visual -representation learning. - -.. figure:: https://yformer.github.io/efficient-sam/EfficientSAM_files/overview.png - :alt: overview.png - - overview.png - -More details about model can be found in -`paper `__, `model web -page `__ and `original -repository `__ - -In this tutorial we consider how to convert and run EfficientSAM using -OpenVINO. 
We also demonstrate how to quantize model using -`NNCF `__ - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Load PyTorch model <#load-pytorch-model>`__ -- `Run PyTorch model inference <#run-pytorch-model-inference>`__ - - - `Prepare input data <#prepare-input-data>`__ - - `Define helpers for input and output - processing <#define-helpers-for-input-and-output-processing>`__ - -- `Convert model to OpenVINO IR - format <#convert-model-to-openvino-ir-format>`__ -- `Run OpenVINO model inference <#run-openvino-model-inference>`__ - - - `Select inference device from dropdown - list <#select-inference-device-from-dropdown-list>`__ - - `Compile OpenVINO model <#compile-openvino-model>`__ - - `Inference and visualize - result <#inference-and-visualize-result>`__ - -- `Quantization <#quantization>`__ - - - `Prepare calibration datasets <#prepare-calibration-datasets>`__ - - `Run Model Quantization <#run-model-quantization>`__ - -- `Verify quantized model - inference <#verify-quantized-model-inference>`__ - - - `Save quantize model on disk <#save-quantize-model-on-disk>`__ - - `Compare quantized model size <#compare-quantized-model-size>`__ - - `Compare inference time of the FP16 and INT8 - models <#compare-inference-time-of-the-fp16-and-int8-models>`__ - -- `Interactive segmentation demo <#interactive-segmentation-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - import platform - - %pip install -q "openvino>=2024.5.0" "nncf>=2.14.0" - %pip install -q "torch>=2.2.0" "torchaudio>=2.2.0" "torchvision>=0.17.0" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q opencv-python "gradio>=4.13" "matplotlib>=3.4" tqdm - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - - -.. parsed-literal:: - - ERROR: Could not find a version that satisfies the requirement openvino>=2024.5.0 (from versions: 2021.3.0, 2021.4.0, 2021.4.1, 2021.4.2, 2022.1.0, 2022.2.0, 2022.3.0, 2022.3.1, 2022.3.2, 2023.0.0.dev20230119, 2023.0.0.dev20230217, 2023.0.0.dev20230407, 2023.0.0.dev20230427, 2023.0.0, 2023.0.1, 2023.0.2, 2023.1.0.dev20230623, 2023.1.0.dev20230728, 2023.1.0.dev20230811, 2023.1.0, 2023.2.0.dev20230922, 2023.2.0, 2023.3.0, 2024.0.0, 2024.1.0, 2024.2.0, 2024.3.0, 2024.4.0, 2024.4.1.dev20240926) - ERROR: No matching distribution found for openvino>=2024.5.0 - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - -.. 
code:: ipython3 - - from cmd_helper import clone_repo - - - repo_dir = clone_repo("https://github.com/yformer/EfficientSAM.git") - - %cd $repo_dir - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file, device_widget, quantization_widget # noqa: F401 - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("efficient-sam.ipynb") - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM - - -Load PyTorch model ------------------- - - - -There are several models available in the repository: - -- **efficient-sam-vitt** - EfficientSAM with Vision Transformer Tiny - (VIT-T) as image encoder. The smallest and fastest model from - EfficientSAM family. -- **efficient-sam-vits** - EfficientSAM with Vision Transformer Small - (VIT-S) as image encoder. Heavier than efficient-sam-vitt, but more - accurate model. - -EfficientSAM provides a unified interface for interaction with models. -It means that all provided steps in the notebook for conversion and -running the model will be the same for all models. Below, you can select -one of them as example. - -.. code:: ipython3 - - from efficient_sam.build_efficient_sam import ( - build_efficient_sam_vitt, - build_efficient_sam_vits, - ) - import zipfile - - MODELS_LIST = { - "efficient-sam-vitt": build_efficient_sam_vitt, - "efficient-sam-vits": build_efficient_sam_vits, - } - - # Since EfficientSAM-S checkpoint file is >100MB, we store the zip file. - with zipfile.ZipFile("weights/efficient_sam_vits.pt.zip", "r") as zip_ref: - zip_ref.extractall("weights") - -Select one from supported models: - -.. code:: ipython3 - - import ipywidgets as widgets - - model_ids = list(MODELS_LIST) - - model_id = widgets.Dropdown( - options=model_ids, - value=model_ids[0], - description="Model:", - disabled=False, - ) - - model_id - - - - -.. parsed-literal:: - - Dropdown(description='Model:', options=('efficient-sam-vitt', 'efficient-sam-vits'), value='efficient-sam-vitt… - - - -build PyTorch model - -.. code:: ipython3 - - pt_model = MODELS_LIST[model_id.value]() - - pt_model.eval(); - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:303: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature. - state_dict = torch.load(f, map_location="cpu") - - -Run PyTorch model inference ---------------------------- - -Now, when we selected and -loaded PyTorch model, we can check its result - -Prepare input data -~~~~~~~~~~~~~~~~~~ - - - -First of all, we should prepare input data for model. Model has 3 -inputs: \* image tensor - tensor with normalized input image. \* input -points - tensor with user provided points. It maybe just some specific -points on the image (e.g. provided by user clicks on the screen) or -bounding box coordinates in format left-top angle point and right-bottom -angle pint. \* input labels - tensor with definition of point type for -each provided point, 1 - for regular point, 2 - left-top point of -bounding box, 3 - right-bottom point of bounding box. - -.. code:: ipython3 - - from PIL import Image - - image_path = "figs/examples/dogs.jpg" - - image = Image.open(image_path) - image - - - - -.. image:: efficient-sam-with-output_files/efficient-sam-with-output_12_0.png - - - -Define helpers for input and output processing -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The code below defines helpers for preparing model input and postprocess -inference results. The input format is accepted by the model described -above. The model predicts mask logits for each pixel on the image and -intersection over union score for each area, how close it is to provided -points. We also provided some helper function for results visualization. - -.. code:: ipython3 - - import torch - import matplotlib.pyplot as plt - import numpy as np - - - def prepare_input(input_image, points, labels, torch_tensor=True): - img_tensor = np.ascontiguousarray(input_image)[None, ...].astype(np.float32) / 255 - img_tensor = np.transpose(img_tensor, (0, 3, 1, 2)) - pts_sampled = np.reshape(np.ascontiguousarray(points), [1, 1, -1, 2]) - pts_labels = np.reshape(np.ascontiguousarray(labels), [1, 1, -1]) - if torch_tensor: - img_tensor = torch.from_numpy(img_tensor) - pts_sampled = torch.from_numpy(pts_sampled) - pts_labels = torch.from_numpy(pts_labels) - return img_tensor, pts_sampled, pts_labels - - - def postprocess_results(predicted_iou, predicted_logits): - sorted_ids = np.argsort(-predicted_iou, axis=-1) - predicted_iou = np.take_along_axis(predicted_iou, sorted_ids, axis=2) - predicted_logits = np.take_along_axis(predicted_logits, sorted_ids[..., None, None], axis=2) - - return predicted_logits[0, 0, 0, :, :] >= 0 - - - def show_points(coords, labels, ax, marker_size=375): - pos_points = coords[labels == 1] - neg_points = coords[labels == 0] - ax.scatter( - pos_points[:, 0], - pos_points[:, 1], - color="green", - marker="*", - s=marker_size, - edgecolor="white", - linewidth=1.25, - ) - ax.scatter( - neg_points[:, 0], - neg_points[:, 1], - color="red", - marker="*", - s=marker_size, - edgecolor="white", - linewidth=1.25, - ) - - - def show_box(box, ax): - x0, y0 = box[0], box[1] - w, h = box[2] - box[0], box[3] - box[1] - ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor="yellow", facecolor=(0, 0, 0, 0), lw=5)) - - - def show_anns(mask, ax): - ax.set_autoscale_on(False) - img = np.ones((mask.shape[0], mask.shape[1], 4)) - img[:, :, 3] = 0 - # for ann in mask: - # m = ann - color_mask = np.concatenate([np.random.random(3), [0.5]]) - img[mask] = color_mask - ax.imshow(img) - -The complete model inference example demonstrated below - -.. 
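Before the complete point-prompt example, here is a small sketch of how a bounding box
is encoded with the same helpers: the box is passed as its two corner points with
labels ``2`` (top-left) and ``3`` (bottom-right). The box coordinates below are purely
illustrative:

.. code:: ipython3

    # bounding box prompt: top-left corner (label 2), bottom-right corner (label 3)
    box_points = [[400, 200], [800, 600]]  # illustrative coordinates
    box_labels = [2, 3]

    box_input = prepare_input(image, box_points, box_labels)
    box_logits, box_iou = pt_model(*box_input)
    box_mask = postprocess_results(box_iou.detach().numpy(), box_logits.detach().numpy())

..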
code:: ipython3 - - input_points = [[580, 350], [650, 350]] - input_labels = [1, 1] - - example_input = prepare_input(image, input_points, input_labels) - - predicted_logits, predicted_iou = pt_model(*example_input) - - predicted_mask = postprocess_results(predicted_iou.detach().numpy(), predicted_logits.detach().numpy()) - -.. code:: ipython3 - - image = Image.open(image_path) - - plt.figure(figsize=(20, 20)) - plt.axis("off") - plt.imshow(image) - show_points(np.array(input_points), np.array(input_labels), plt.gca()) - plt.figure(figsize=(20, 20)) - plt.axis("off") - plt.imshow(image) - show_anns(predicted_mask, plt.gca()) - plt.title(f"PyTorch {model_id.value}", fontsize=18) - plt.show() - - - -.. image:: efficient-sam-with-output_files/efficient-sam-with-output_17_0.png - - - -.. image:: efficient-sam-with-output_files/efficient-sam-with-output_17_1.png - - -Convert model to OpenVINO IR format ------------------------------------ - - - -OpenVINO supports PyTorch models via conversion in Intermediate -Representation (IR) format using OpenVINO `Model Conversion -API `__. -``openvino.convert_model`` function accepts instance of PyTorch model -and example input (that helps in correct model operation tracing and -shape inference) and returns ``openvino.Model`` object that represents -model in OpenVINO framework. This ``openvino.Model`` is ready for -loading on the device using ``ov.Core.compile_model`` or can be saved on -disk using ``openvino.save_model``. - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - - ov_model_path = Path(f"{model_id.value}.xml") - - if not ov_model_path.exists(): - ov_model = ov.convert_model(pt_model, example_input=example_input) - ov.save_model(ov_model, ov_model_path) - else: - ov_model = core.read_model(ov_model_path) - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:220: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if ( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:241: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert ( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:163: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - size = int(math.sqrt(xy_num)) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert size * size == xy_num - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:166: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if size != h or size != w: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:251: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert x.shape[2] == num_patches - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:85: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if num_pts > self.decoder_max_num_input_points: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:92: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - elif num_pts < self.decoder_max_num_input_points: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:126: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if output_w > 0 and output_h > 0: - - -Run OpenVINO model inference ----------------------------- - - - -Select inference device from dropdown list -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Compile OpenVINO model -~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - compiled_model = core.compile_model(ov_model, device.value) - -Inference and visualize result -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Now, we can take a look on OpenVINO model prediction - -.. 
code:: ipython3 - - example_input = prepare_input(image, input_points, input_labels, torch_tensor=False) - result = compiled_model(example_input) - - predicted_logits, predicted_iou = result[0], result[1] - - predicted_mask = postprocess_results(predicted_iou, predicted_logits) - - plt.figure(figsize=(20, 20)) - plt.axis("off") - plt.imshow(image) - show_points(np.array(input_points), np.array(input_labels), plt.gca()) - plt.figure(figsize=(20, 20)) - plt.axis("off") - plt.imshow(image) - show_anns(predicted_mask, plt.gca()) - plt.title(f"OpenVINO {model_id.value}", fontsize=18) - plt.show() - - - -.. image:: efficient-sam-with-output_files/efficient-sam-with-output_25_0.png - - - -.. image:: efficient-sam-with-output_files/efficient-sam-with-output_25_1.png - - -Quantization ------------- - - - -`NNCF `__ enables -post-training quantization by adding the quantization layers into the -model graph and then using a subset of the training dataset to -initialize the parameters of these additional quantization layers. The -framework is designed so that modifications to your original training -code are minor. - -The optimization process contains the following steps: - -1. Create a calibration dataset for quantization. -2. Run ``nncf.quantize`` to obtain quantized encoder and decoder models. -3. Serialize the ``INT8`` model using ``openvino.save_model`` function. - -.. - - **Note**: Quantization is time and memory consuming operation. - Running quantization code below may take some time. - -Please select below whether you would like to run EfficientSAM -quantization. - -.. code:: ipython3 - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - import requests - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -Prepare calibration datasets -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The first step is to prepare calibration datasets for quantization. We -will use coco128 dataset for quantization. Usually, this dataset used -for solving object detection task and its annotation provides box -coordinates for images. In our case, box coordinates will serve as input -points for object segmentation, the code below downloads dataset and -creates DataLoader for preparing inputs for EfficientSAM model. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from zipfile import ZipFile - - DATA_URL = "https://ultralytics.com/assets/coco128.zip" - OUT_DIR = Path('.') - - download_file(DATA_URL, directory=OUT_DIR, show_progress=True) - - if not (OUT_DIR / "coco128/images/train2017").exists(): - with ZipFile('coco128.zip' , "r") as zip_ref: - zip_ref.extractall(OUT_DIR) - - - -.. 
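As described above, annotation boxes serve as prompts for calibration. A minimal sketch
of how a single calibration item can be assembled and how the quantization call itself
typically looks; the illustrative box, the one-item calibration set and the
``nncf.quantize`` arguments are assumptions for demonstration, while the notebook builds
its real calibration set from the coco128 annotation boxes:

.. code:: ipython3

    %%skip not $to_quantize.value

    import nncf


    def box_to_efficientsam_input(pil_image, box_xyxy):
        # a box prompt is its two corners: top-left (label 2) and bottom-right (label 3)
        points = [[box_xyxy[0], box_xyxy[1]], [box_xyxy[2], box_xyxy[3]]]
        labels = [2, 3]
        return prepare_input(pil_image, points, labels, torch_tensor=False)


    # tiny calibration set built from the example image and an illustrative box
    calibration_items = [box_to_efficientsam_input(image, [100, 100, 500, 400])]

    quantized_sketch = nncf.quantize(
        ov_model,
        nncf.Dataset(calibration_items),
        model_type=nncf.ModelType.TRANSFORMER,
        subset_size=len(calibration_items),
    )

..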
parsed-literal:: - - coco128.zip: 0%| | 0.00/6.66M [00:00 - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 6 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 6 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'batched_images'!. This input will be filled with random values! - [ WARNING ] No input files were given for input 'batched_points'!. This input will be filled with random values! - [ WARNING ] No input files were given for input 'batched_point_labels'!. This input will be filled with random values! - [ INFO ] Fill input 'batched_images' with random values - [ INFO ] Fill input 'batched_points' with random values - [ INFO ] Fill input 'batched_point_labels' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 6 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in full mode (inputs filling are included in measurement loop). - [ INFO ] First inference took 795.15 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 53 iterations - [ INFO ] Duration: 16627.52 ms - [ INFO ] Latency: - [ INFO ] Median: 1878.09 ms - [ INFO ] Average: 1853.13 ms - [ INFO ] Min: 1337.08 ms - [ INFO ] Max: 2026.48 ms - [ INFO ] Throughput: 3.19 FPS - - -.. code:: ipython3 - - if to_quantize.value: - !benchmark_app -m $quantized_model_path -d $device.value -data_shape "batched_images[1,3,512,512],batched_points[1,1,2,2],batched_point_labels[1,1,2]" -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 44.76 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] batched_images (node: batched_images) : f32 / [...] / [?,?,?,?] - [ INFO ] batched_points (node: batched_points) : i64 / [...] / [?,?,?,?] - [ INFO ] batched_point_labels (node: batched_point_labels) : i64 / [...] / [?,?,?] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: aten::reshape/Reshape_3) : f32 / [...] / [?,?,?,?,?] - [ INFO ] ***NO_NAME*** (node: aten::reshape/Reshape_2) : f32 / [...] / [?,?,?] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] batched_images (node: batched_images) : f32 / [...] / [?,?,?,?] - [ INFO ] batched_points (node: batched_points) : i64 / [...] / [?,?,?,?] - [ INFO ] batched_point_labels (node: batched_point_labels) : i64 / [...] / [?,?,?] 
- [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: aten::reshape/Reshape_3) : f32 / [...] / [?,?,?,?,?] - [ INFO ] ***NO_NAME*** (node: aten::reshape/Reshape_2) : f32 / [...] / [?,?,?] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 1630.00 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 6 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 24 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 6 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 6 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'batched_images'!. This input will be filled with random values! - [ WARNING ] No input files were given for input 'batched_points'!. This input will be filled with random values! - [ WARNING ] No input files were given for input 'batched_point_labels'!. This input will be filled with random values! - [ INFO ] Fill input 'batched_images' with random values - [ INFO ] Fill input 'batched_points' with random values - [ INFO ] Fill input 'batched_point_labels' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 6 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in full mode (inputs filling are included in measurement loop). - [ INFO ] First inference took 574.01 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 55 iterations - [ INFO ] Duration: 15877.41 ms - [ INFO ] Latency: - [ INFO ] Median: 1710.77 ms - [ INFO ] Average: 1697.13 ms - [ INFO ] Min: 583.87 ms - [ INFO ] Max: 1826.89 ms - [ INFO ] Throughput: 3.46 FPS - - -Interactive segmentation demo ------------------------------ - - - -.. 
code:: ipython3 - - import numpy as np - from PIL import Image - import cv2 - import matplotlib.pyplot as plt - - - def sigmoid(x): - return 1 / (1 + np.exp(-x)) - - - def format_results(masks, scores, logits, filter=0): - annotations = [] - n = len(scores) - for i in range(n): - annotation = {} - - mask = masks[i] - tmp = np.where(mask != 0) - if np.sum(mask) < filter: - continue - annotation["id"] = i - annotation["segmentation"] = mask - annotation["bbox"] = [ - np.min(tmp[0]), - np.min(tmp[1]), - np.max(tmp[1]), - np.max(tmp[0]), - ] - annotation["score"] = scores[i] - annotation["area"] = annotation["segmentation"].sum() - annotations.append(annotation) - return annotations - - - def point_prompt(masks, points, point_label, target_height, target_width): # numpy - h = masks[0]["segmentation"].shape[0] - w = masks[0]["segmentation"].shape[1] - if h != target_height or w != target_width: - points = [[int(point[0] * w / target_width), int(point[1] * h / target_height)] for point in points] - onemask = np.zeros((h, w)) - for i, annotation in enumerate(masks): - if isinstance(annotation, dict): - mask = annotation["segmentation"] - else: - mask = annotation - for i, point in enumerate(points): - if point[1] < mask.shape[0] and point[0] < mask.shape[1]: - if mask[point[1], point[0]] == 1 and point_label[i] == 1: - onemask += mask - if mask[point[1], point[0]] == 1 and point_label[i] == 0: - onemask -= mask - onemask = onemask >= 1 - return onemask, 0 - - - def show_mask( - annotation, - ax, - random_color=False, - bbox=None, - retinamask=True, - target_height=960, - target_width=960, - ): - mask_sum = annotation.shape[0] - height = annotation.shape[1] - weight = annotation.shape[2] - # annotation is sorted by area - areas = np.sum(annotation, axis=(1, 2)) - sorted_indices = np.argsort(areas)[::1] - annotation = annotation[sorted_indices] - - index = (annotation != 0).argmax(axis=0) - if random_color: - color = np.random.random((mask_sum, 1, 1, 3)) - else: - color = np.ones((mask_sum, 1, 1, 3)) * np.array([30 / 255, 144 / 255, 255 / 255]) - transparency = np.ones((mask_sum, 1, 1, 1)) * 0.6 - visual = np.concatenate([color, transparency], axis=-1) - mask_image = np.expand_dims(annotation, -1) * visual - - mask = np.zeros((height, weight, 4)) - - h_indices, w_indices = np.meshgrid(np.arange(height), np.arange(weight), indexing="ij") - indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None)) - - mask[h_indices, w_indices, :] = mask_image[indices] - if bbox is not None: - x1, y1, x2, y2 = bbox - ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor="b", linewidth=1)) - - if not retinamask: - mask = cv2.resize(mask, (target_width, target_height), interpolation=cv2.INTER_NEAREST) - - return mask - - - def process( - annotations, - image, - scale, - better_quality=False, - mask_random_color=True, - bbox=None, - points=None, - use_retina=True, - withContours=True, - ): - if isinstance(annotations[0], dict): - annotations = [annotation["segmentation"] for annotation in annotations] - - original_h = image.height - original_w = image.width - if better_quality: - if isinstance(annotations[0], torch.Tensor): - annotations = np.array(annotations) - for i, mask in enumerate(annotations): - mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8)) - annotations[i] = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8)) - annotations = np.array(annotations) - inner_mask = show_mask( - annotations, - 
plt.gca(), - random_color=mask_random_color, - bbox=bbox, - retinamask=use_retina, - target_height=original_h, - target_width=original_w, - ) - - if isinstance(annotations, torch.Tensor): - annotations = annotations.cpu().numpy() - - if withContours: - contour_all = [] - temp = np.zeros((original_h, original_w, 1)) - for i, mask in enumerate(annotations): - if isinstance(mask, dict): - mask = mask["segmentation"] - annotation = mask.astype(np.uint8) - if not use_retina: - annotation = cv2.resize( - annotation, - (original_w, original_h), - interpolation=cv2.INTER_NEAREST, - ) - contours, _ = cv2.findContours(annotation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - for contour in contours: - contour_all.append(contour) - cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2 // scale) - color = np.array([0 / 255, 0 / 255, 255 / 255, 0.9]) - contour_mask = temp / 255 * color.reshape(1, 1, -1) - - image = image.convert("RGBA") - overlay_inner = Image.fromarray((inner_mask * 255).astype(np.uint8), "RGBA") - image.paste(overlay_inner, (0, 0), overlay_inner) - - if withContours: - overlay_contour = Image.fromarray((contour_mask * 255).astype(np.uint8), "RGBA") - image.paste(overlay_contour, (0, 0), overlay_contour) - - return image - - - def segment_with_boxs( - image, - seg_image, - global_points, - global_point_label, - input_size=1024, - better_quality=False, - withContours=True, - use_retina=True, - mask_random_color=True, - ): - if global_points is None or len(global_points) < 2 or global_points[0] is None: - return image, global_points, global_point_label - - input_size = int(input_size) - w, h = image.size - scale = input_size / max(w, h) - new_w = int(w * scale) - new_h = int(h * scale) - image = image.resize((new_w, new_h)) - - scaled_points = np.array([[int(x * scale) for x in point] for point in global_points]) - scaled_points = scaled_points[:2] - scaled_point_label = np.array(global_point_label)[:2] - - if scaled_points.size == 0 and scaled_point_label.size == 0: - return image, global_points, global_point_label - - nd_image = np.array(image) - img_tensor = nd_image.astype(np.float32) / 255 - img_tensor = np.transpose(img_tensor, (2, 0, 1)) - - pts_sampled = np.reshape(scaled_points, [1, 1, -1, 2]) - pts_sampled = pts_sampled[:, :, :2, :] - pts_labels = np.reshape(np.array([2, 3]), [1, 1, 2]) - - results = compiled_model([img_tensor[None, ...], pts_sampled, pts_labels]) - predicted_logits = results[0] - predicted_iou = results[1] - all_masks = sigmoid(predicted_logits[0, 0, :, :, :]) >= 0.5 - predicted_iou = predicted_iou[0, 0, ...] 
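    # among the returned mask candidates, keep the one with the highest predicted IoU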
- - max_predicted_iou = -1 - selected_mask_using_predicted_iou = None - selected_predicted_iou = None - - for m in range(all_masks.shape[0]): - curr_predicted_iou = predicted_iou[m] - if curr_predicted_iou > max_predicted_iou or selected_mask_using_predicted_iou is None: - max_predicted_iou = curr_predicted_iou - selected_mask_using_predicted_iou = all_masks[m : m + 1] - selected_predicted_iou = predicted_iou[m : m + 1] - - results = format_results(selected_mask_using_predicted_iou, selected_predicted_iou, predicted_logits, 0) - - annotations = results[0]["segmentation"] - annotations = np.array([annotations]) - fig = process( - annotations=annotations, - image=image, - scale=(1024 // input_size), - better_quality=better_quality, - mask_random_color=mask_random_color, - use_retina=use_retina, - bbox=scaled_points.reshape([4]), - withContours=withContours, - ) - - global_points = [] - global_point_label = [] - return fig, global_points, global_point_label - - - def segment_with_points( - image, - global_points, - global_point_label, - input_size=1024, - better_quality=False, - withContours=True, - use_retina=True, - mask_random_color=True, - ): - input_size = int(input_size) - w, h = image.size - scale = input_size / max(w, h) - new_w = int(w * scale) - new_h = int(h * scale) - image = image.resize((new_w, new_h)) - - if global_points is None or len(global_points) < 1 or global_points[0] is None: - return image, global_points, global_point_label - scaled_points = np.array([[int(x * scale) for x in point] for point in global_points]) - scaled_point_label = np.array(global_point_label) - - if scaled_points.size == 0 and scaled_point_label.size == 0: - return image, global_points, global_point_label - - nd_image = np.array(image) - img_tensor = (nd_image).astype(np.float32) / 255 - img_tensor = np.transpose(img_tensor, (2, 0, 1)) - - pts_sampled = np.reshape(scaled_points, [1, 1, -1, 2]) - pts_labels = np.reshape(np.array(global_point_label), [1, 1, -1]) - - results = compiled_model([img_tensor[None, ...], pts_sampled, pts_labels]) - predicted_logits = results[0] - predicted_iou = results[1] - all_masks = sigmoid(predicted_logits[0, 0, :, :, :]) >= 0.5 - predicted_iou = predicted_iou[0, 0, ...] - - results = format_results(all_masks, predicted_iou, predicted_logits, 0) - annotations, _ = point_prompt(results, scaled_points, scaled_point_label, new_h, new_w) - annotations = np.array([annotations]) - - fig = process( - annotations=annotations, - image=image, - scale=(1024 // input_size), - better_quality=better_quality, - mask_random_color=mask_random_color, - points=scaled_points, - bbox=None, - use_retina=use_retina, - withContours=withContours, - ) - - global_points = [] - global_point_label = [] - # return fig, None - return fig, global_points, global_point_label - -.. code:: ipython3 - - # Go back to the efficient-sam notebook directory - %cd .. 
- - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/efficient-sam/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(segment_with_point_fn=segment_with_points, segment_with_box_fn=segment_with_boxs) - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. - - - - - - - diff --git a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_12_0.jpg b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_12_0.jpg deleted file mode 100644 index 15a4586b4a15c7..00000000000000 --- a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_12_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:75a4e5c82ce450c769c17c13b92e06be35349f37b7544c03ef399446179b3c47 -size 50984 diff --git a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_12_0.png b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_12_0.png deleted file mode 100644 index a3362620caebb6..00000000000000 --- a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_12_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:97f75e509a7fe580c1e3d082c255a76f9b353b88e16b0ffed0e339e1acfa2afe -size 862579 diff --git a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_17_0.png b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_17_0.png deleted file mode 100644 index 444d465aaefcdf..00000000000000 --- a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_17_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:de9de00a469c6064b84bd3a5becb0a265eaaee5fa99441e6924b6064bfdc11ae -size 1330911 diff --git a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_17_1.png b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_17_1.png deleted file mode 100644 index ba5079eaf1f8d0..00000000000000 --- a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_17_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a2dec3222a316092f851094cc92384c581ff96812ec0ede5d3d1252fe9043766 -size 1260413 diff --git a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_25_0.png b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_25_0.png deleted file mode 100644 index 444d465aaefcdf..00000000000000 --- a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_25_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:de9de00a469c6064b84bd3a5becb0a265eaaee5fa99441e6924b6064bfdc11ae -size 1330911 diff --git a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_25_1.png 
b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_25_1.png deleted file mode 100644 index dfa9fe77bb8f6c..00000000000000 --- a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_25_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9142ed1dea8d5db109cd7dd833ef7f9d978e12b1e9e1a8a7965290da8bb558a7 -size 1261076 diff --git a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_36_0.png b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_36_0.png deleted file mode 100644 index 444d465aaefcdf..00000000000000 --- a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_36_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:de9de00a469c6064b84bd3a5becb0a265eaaee5fa99441e6924b6064bfdc11ae -size 1330911 diff --git a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_36_1.png b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_36_1.png deleted file mode 100644 index 0479fcd2653cab..00000000000000 --- a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_36_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:224715ab4b66cd6a21b26c8ed82ecac231096bd71ec9ebf9ceb0e21bb833ea9b -size 1261787 diff --git a/docs/notebooks/encodec-audio-compression-with-output.rst b/docs/notebooks/encodec-audio-compression-with-output.rst deleted file mode 100644 index bc1564cf9084b2..00000000000000 --- a/docs/notebooks/encodec-audio-compression-with-output.rst +++ /dev/null @@ -1,637 +0,0 @@ -Audio compression with EnCodec and OpenVINO -=========================================== - -Compression is an important part of the Internet today because it -enables people to easily share high-quality photos, listen to audio -messages, stream their favorite shows, and so much more. Even when using -today’s state-of-the-art techniques, enjoying these rich multimedia -experiences requires a high speed Internet connection and plenty of -storage space. AI helps to overcome these limitations: “Imagine -listening to a friend’s audio message in an area with low connectivity -and not having it stall or glitch.” - -This tutorial considers ways to use OpenVINO and EnCodec algorithm for -hyper compression of audio. EnCodec is a real-time, high-fidelity audio -codec that uses AI to compress audio files without losing quality. It -was introduced in `High Fidelity Neural Audio -Compression `__ paper by Meta AI. -The researchers claimed they achieved an approximate 10x compression -rate without loss of quality and made it work for CD-quality audio. More -details about this approach can be found in `Meta AI -blog `__ -and original `repo `__. - -.. 
figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/17546d66-12b9-4841-9293-cc878258a186 - :alt: image.png - - image.png - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Instantiate audio compression - pipeline <#instantiate-audio-compression-pipeline>`__ -- `Explore EnCodec pipeline <#explore-encodec-pipeline>`__ - - - `Preprocessing <#preprocessing>`__ - - `Encoding <#encoding>`__ - - `Decompression <#decompression>`__ - -- `Convert model to OpenVINO Intermediate Representation - format <#convert-model-to-openvino-intermediate-representation-format>`__ -- `Integrate OpenVINO to EnCodec - pipeline <#integrate-openvino-to-encodec-pipeline>`__ - - - `Select inference device <#select-inference-device>`__ - -- `Run EnCodec with OpenVINO <#run-encodec-with-openvino>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install required dependencies: - -.. code:: ipython3 - - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "openvino>=2023.3.0" "torch>=2.1" "torchaudio>=2.1" "encodec>=0.1.1" "gradio>=4.19" "librosa>=0.8.1" "matplotlib>=3.4" tqdm - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -Instantiate audio compression pipeline --------------------------------------- - - - -`Codecs `__, which act as encoders -and decoders for streams of data, help empower most of the audio -compression people currently use online. Some examples of commonly used -codecs include MP3, Opus, and EVS. Classic codecs like these decompose -the signal between different frequencies and encode as efficiently as -possible. Most classic codecs leverage human hearing knowledge -(psychoacoustics) but have a finite or given set of handcrafted ways to -efficiently encode and decode the file. EnCodec, a neural network that -is trained from end to end to reconstruct the input signal, was -introduced as an attempt to overcome this limitation. It consists of -three parts: - -- The **encoder**, which takes the uncompressed data in and transforms - it into a higher dimensional and lower frame rate representation. - -- The **quantizer**, which compresses this representation to the target - size. This compressed representation is what is stored on disk or - will be sent through the network. - -- The **decoder** is the final step. It turns the compressed signal - back into a waveform that is as similar as possible to the original. - The key to lossless compression is to identify changes that will not - be perceivable by humans, as perfect reconstruction is impossible at - low bit rates. - -|encodec_compression|) - -The authors provide two multi-bandwidth models: \* -``encodec_model_24khz`` - a causal model operating at 24 kHz on -monophonic audio trained on a variety of audio data. \* -``encodec_model_48khz`` - a non-causal model operating at 48 kHz on -stereophonic audio trained on music-only data. - -In this tutorial, we will use ``encodec_model_24khz`` as an example, but -the same actions are also applicable to ``encodec_model_48khz`` model as -well. 
To start working with this model, we need to instantiate model -class using ``EncodecModel.encodec_model_24khz()`` and select required -compression bandwidth among available: 1.5, 3, 6, 12 or 24 kbps for 24 -kHz model and 3, 6, 12 and 24 kbps for 48 kHz model. We will use 6 kbps -bandwidth. - -.. |encodec_compression| image:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/5cd9a482-b42b-4dea-85a5-6d66b20ce13d - -.. code:: ipython3 - - from encodec import compress, decompress - from encodec.utils import convert_audio, save_audio - from encodec.compress import MODELS - import torchaudio - import torch - import typing as tp - - # Fetch `notebook_utils` module - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("encodec-audio-compression.ipynb") - - model_id = "encodec_24khz" - # Instantiate a pretrained EnCodec model - model = MODELS[model_id]() - model.set_target_bandwidth(6.0) - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. - WeightNorm.apply(module, name, dim) - - -Explore EnCodec pipeline ------------------------- - - - -Let us explore model capabilities on example audio: - -.. code:: ipython3 - - import librosa - import matplotlib.pyplot as plt - import librosa.display - import IPython.display as ipd - - from notebook_utils import download_file, device_widget - - test_data_url = "https://github.com/facebookresearch/encodec/raw/main/test_24k.wav" - - sample_file = "test_24k.wav" - if not Path(sample_file).exists(): - download_file(test_data_url, sample_file) - audio, sr = librosa.load(sample_file) - plt.figure(figsize=(14, 5)) - librosa.display.waveshow(audio, sr=sr) - - ipd.Audio(sample_file) - - - -.. parsed-literal:: - - test_24k.wav: 0%| | 0.00/938k [00:00 - - Your browser does not support the audio element. - - - - - - -.. image:: encodec-audio-compression-with-output_files/encodec-audio-compression-with-output_6_2.png - - -Preprocessing -~~~~~~~~~~~~~ - - - -To achieve the best result, audio should have the number of channels and -sample rate expected by the model. If audio does not fulfill these -requirements, it can be converted to the desired sample rate and the -number of channels using the ``convert_audio`` function. - -.. code:: ipython3 - - model_sr, model_channels = model.sample_rate, model.channels - print(f"Model expected sample rate {model_sr}") - print(f"Model expected audio format {'mono' if model_channels == 1 else 'stereo'}") - - -.. parsed-literal:: - - Model expected sample rate 24000 - Model expected audio format mono - - -.. code:: ipython3 - - # Load and pre-process the audio waveform - wav, sr = torchaudio.load(sample_file) - - wav = convert_audio(wav, sr, model_sr, model_channels) - -Encoding -~~~~~~~~ - - - -Audio waveform should be split by chunks and then encoded by Encoder -model, then compressed by quantizer for reducing memory. 
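-
-Before using the high-level ``compress`` helper, it can be instructive
-to look at what the encoder and quantizer actually produce. The cell
-below is an optional sketch (it assumes the ``model`` and ``wav``
-variables defined above and calls the ``encode``/``decode`` methods of
-``EncodecModel`` directly); the quantizer output is simply a small
-tensor of integer codebook indices per audio frame.
-
-.. code:: ipython3
-
-    # Optional: inspect the discrete codes produced by the encoder and quantizer.
-    with torch.no_grad():
-        encoded_frames = model.encode(wav.unsqueeze(0))  # expects [batch, channels, time]
-
-    # Each frame is a (codes, scale) pair; codes have shape [batch, n_codebooks, n_frames].
-    codes = torch.cat([frame[0] for frame in encoded_frames], dim=-1)
-    print(f"Discrete codes shape: {tuple(codes.shape)}, dtype: {codes.dtype}")
-
-    # The decoder turns the codes back into a waveform of roughly the original length.
-    with torch.no_grad():
-        reconstructed = model.decode(encoded_frames)
-    print(f"Reconstructed waveform shape: {tuple(reconstructed.shape)}")
-
-The notebook itself uses the higher-level ``compress`` helper in the
-next cell, which wraps these calls and serializes the result.
-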
The result of -compression is a binary file with ``ecdc`` extension, a special format -for storing EnCodec compressed audio on disc. - -.. code:: ipython3 - - from pathlib import Path - - - out_file = Path("compressed.ecdc") - b = compress(model, wav) - out_file.write_bytes(b) - - - - -.. parsed-literal:: - - 15067 - - - -Let us compare obtained compression result: - -.. code:: ipython3 - - import os - - orig_file_stats = os.stat(sample_file) - compressed_file_stats = os.stat("compressed.ecdc") - print(f"size before compression in Bytes: {orig_file_stats.st_size}") - print(f"size after compression in Bytes: {compressed_file_stats.st_size}") - print(f"Compression file size ratio: {orig_file_stats.st_size / compressed_file_stats.st_size:.2f}") - - -.. parsed-literal:: - - size before compression in Bytes: 960078 - size after compression in Bytes: 15067 - Compression file size ratio: 63.72 - - -Great! Now, we see the power of hyper compression. Binary size of a file -becomes 60 times smaller and more suitable for sending via network. - -Decompression -~~~~~~~~~~~~~ - - - -After successful sending of the compressed audio, it should be -decompressed on the recipient’s side. The decoder model is responsible -for restoring the compressed signal back into a waveform that is as -similar as possible to the original. - -.. code:: ipython3 - - out, out_sr = decompress(out_file.read_bytes()) - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. - WeightNorm.apply(module, name, dim) - - -.. code:: ipython3 - - output_file = "decopressed.wav" - save_audio(out, output_file, out_sr) - -The decompressed audio will be saved to the ``decompressed.wav`` file -when decompression is finished. We can compare result with the original -audio. - -.. code:: ipython3 - - audio, sr = librosa.load(output_file) - plt.figure(figsize=(14, 5)) - librosa.display.waveshow(audio, sr=sr) - - ipd.Audio(output_file) - - - - -.. raw:: html - - - - - - - - -.. image:: encodec-audio-compression-with-output_files/encodec-audio-compression-with-output_19_1.png - - -Nice! Audio sounds close to original. - -Convert model to OpenVINO Intermediate Representation format ------------------------------------------------------------- - - - -For best results with OpenVINO, it is recommended to convert the model -to OpenVINO IR format. OpenVINO supports PyTorch via conversion to -OpenVINO IR format. We need to provide initialized model’s instance and -example of inputs for shape inference. We will use ``ov.convert_model`` -functionality to convert the PyTorch models. The ``ov.convert_model`` -Python function returns an OpenVINO model ready to load on the device -and start making predictions. We can save it on disk for the next usage -with ``ov.save_model``. - -.. code:: ipython3 - - class FrameEncoder(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, x: torch.Tensor): - codes, scale = self.model._encode_frame(x) - if not self.model.normalize: - return codes - return codes, scale - -.. code:: ipython3 - - class FrameDecoder(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, codes, scale=None): - return model._decode_frame((codes, scale)) - -.. 
code:: ipython3 - - encoder = FrameEncoder(model) - decoder = FrameDecoder(model) - -.. code:: ipython3 - - import openvino as ov - - - core = ov.Core() - - OV_ENCODER_PATH = Path("encodec_encoder.xml") - if not OV_ENCODER_PATH.exists(): - encoder_ov = ov.convert_model(encoder, example_input=torch.zeros(1, 1, 480000), input=[[1, 1, 480000]]) - ov.save_model(encoder_ov, OV_ENCODER_PATH) - else: - encoder_ov = core.read_model(OV_ENCODER_PATH) - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:60: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:85: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:87: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - max_pad = max(padding_left, padding_right) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:89: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if length <= max_pad: - - -.. code:: ipython3 - - OV_DECODER_PATH = Path("encodec_decoder.xml") - if not OV_DECODER_PATH.exists(): - decoder_ov = ov.convert_model( - decoder, - example_input=torch.zeros([1, 8, 1500], dtype=torch.long), - input=[[1, 8, 1500]], - ) - ov.save_model(decoder_ov, OV_DECODER_PATH) - else: - decoder_ov = core.read_model(OV_DECODER_PATH) - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/quantization/core_vq.py:358: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. 
- quantized_out = torch.tensor(0.0, device=q_indices.device) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/quantization/core_vq.py:359: TracerWarning: Iterating over a tensor might cause the trace to be incorrect. Passing a tensor of different shape won't change the number of iterations executed (and might lead to errors or silently give incorrect results). - for i, indices in enumerate(q_indices): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:103: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert (padding_left + padding_right) <= x.shape[-1] - - -Integrate OpenVINO to EnCodec pipeline --------------------------------------- - - - -The following steps are required for integration of OpenVINO to EnCodec -pipeline: - -1. Load the model to a device. -2. Define audio frame processing functions. -3. Replace the original frame processing functions with OpenVINO based - algorithms. - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - compiled_encoder = core.compile_model(encoder_ov, device.value) - encoder_out = compiled_encoder.output(0) - - compiled_decoder = core.compile_model(decoder_ov, device.value) - decoder_out = compiled_decoder.output(0) - -.. code:: ipython3 - - def encode_frame(x: torch.Tensor): - has_scale = len(compiled_encoder.outputs) == 2 - result = compiled_encoder(x) - codes = torch.from_numpy(result[encoder_out]) - if has_scale: - scale = torch.from_numpy(result[compiled_encoder.output(1)]) - else: - scale = None - return codes, scale - -.. code:: ipython3 - - EncodedFrame = tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]] - - - def decode_frame(encoded_frame: EncodedFrame): - codes, scale = encoded_frame - inputs = [codes] - if scale is not None: - inputs.append(scale) - return torch.from_numpy(compiled_decoder(inputs)[decoder_out]) - -.. code:: ipython3 - - model._encode_frame = encode_frame - model._decode_frame = decode_frame - - MODELS[model_id] = lambda: model - -Run EnCodec with OpenVINO -------------------------- - - - -The process of running encodec with OpenVINO under hood will be the same -like with the original PyTorch models. - -.. code:: ipython3 - - b = compress(model, wav, use_lm=False) - out_file = Path("compressed_ov.ecdc") - out_file.write_bytes(b) - - - - -.. parsed-literal:: - - 15067 - - - -.. code:: ipython3 - - out, out_sr = decompress(out_file.read_bytes()) - -.. code:: ipython3 - - ov_output_file = "decopressed_ov.wav" - save_audio(out, ov_output_file, out_sr) - -.. code:: ipython3 - - audio, sr = librosa.load(ov_output_file) - plt.figure(figsize=(14, 5)) - librosa.display.waveshow(audio, sr=sr) - - ipd.Audio(ov_output_file) - - - - -.. raw:: html - - - - - - - - -.. image:: encodec-audio-compression-with-output_files/encodec-audio-compression-with-output_38_1.png - - -.. 
code:: ipython3 - - import gradio as gr - from typing import Tuple - import numpy as np - - - def preprocess(input, sample_rate, model_sr, model_channels): - input = torch.tensor(input, dtype=torch.float32) - input = input / 2**15 # adjust to int16 scale - input = input.unsqueeze(0) - input = convert_audio(input, sample_rate, model_sr, model_channels) - - return input - - - def postprocess(output): - output = output.squeeze() - output = output * 2**15 # adjust to [-1, 1] scale - output = output.numpy(force=True) - output = output.astype(np.int16) - - return output - - - def _compress(input: Tuple[int, np.ndarray]): - sample_rate, waveform = input - waveform = preprocess(waveform, sample_rate, model_sr, model_channels) - - b = compress(model, waveform, use_lm=False) - out, out_sr = decompress(b) - - out = postprocess(out) - return out_sr, out - - - demo = gr.Interface(_compress, "audio", "audio", examples=["test_24k.wav"]) - - try: - demo.launch(debug=False) - except Exception: - demo.launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - - -.. parsed-literal:: - - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. - - - - - - - diff --git a/docs/notebooks/encodec-audio-compression-with-output_files/encodec-audio-compression-with-output_19_1.png b/docs/notebooks/encodec-audio-compression-with-output_files/encodec-audio-compression-with-output_19_1.png deleted file mode 100644 index 9f01201bccd659..00000000000000 --- a/docs/notebooks/encodec-audio-compression-with-output_files/encodec-audio-compression-with-output_19_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a031358d39936f6ccdb1e4e8c9eb8ddda651384ecf7d95fbe6c2dc1f7e65be95 -size 44175 diff --git a/docs/notebooks/encodec-audio-compression-with-output_files/encodec-audio-compression-with-output_38_1.png b/docs/notebooks/encodec-audio-compression-with-output_files/encodec-audio-compression-with-output_38_1.png deleted file mode 100644 index d157f39a8fc143..00000000000000 --- a/docs/notebooks/encodec-audio-compression-with-output_files/encodec-audio-compression-with-output_38_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f2800c74996f567b92758358b136cc2acab70b48ea628ac392e59cecc1c416a3 -size 44186 diff --git a/docs/notebooks/encodec-audio-compression-with-output_files/encodec-audio-compression-with-output_6_2.png b/docs/notebooks/encodec-audio-compression-with-output_files/encodec-audio-compression-with-output_6_2.png deleted file mode 100644 index 93baa1aa5eeea6..00000000000000 --- a/docs/notebooks/encodec-audio-compression-with-output_files/encodec-audio-compression-with-output_6_2.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:491264f7b803244b0230b7a7bebee6b81da547541ccf928fbae1c9c0af719451 -size 44933 diff --git a/docs/notebooks/explainable-ai-1-basic-with-output.rst b/docs/notebooks/explainable-ai-1-basic-with-output.rst deleted file mode 100644 index 3025da7962cabb..00000000000000 --- a/docs/notebooks/explainable-ai-1-basic-with-output.rst +++ /dev/null @@ -1,300 +0,0 @@ -OpenVINO™ Explainable AI Toolkit (1/3): Basic -============================================= - -.. warning:: - - Important note: This notebook requires python >= 3.10. 
Please make - sure that your environment fulfill to this requirement before running - it - -This is the **first notebook** in series of exploring `OpenVINO™ -Explainable AI -(XAI) `__: - -1. `OpenVINO™ Explainable AI Toolkit (1/3): - Basic `__ -2. `OpenVINO™ Explainable AI Toolkit (2/3): Deep - Dive `__ -3. `OpenVINO™ Explainable AI Toolkit (3/3): Saliency map - interpretation `__ - -It covers the basic introduction to -`XAI `__ toolkit and -shows how to explain predictions of an image classification model. - -`OpenVINO™ Explainable AI -(XAI) `__ provides a -suite of XAI algorithms for the analysis of OpenVINO IR model -predictions. Generated saliency maps can be used to evaluate model -reasoning when making predictions. - -A **saliency map** is a visualization technique that highlights regions -of the interest in an image from the model perspective. For example, it -can be used to explain classification model predictions for a particular -label. Here is an example of a saliency map that you will get in this -notebook: - -.. image:: https://github.com/openvinotoolkit/openvino_xai/assets/17028475/ccb67c0b-c58e-4beb-889f-af0aff21cb66 - -A pre-trained `MobileNetV3 -model `__ -from `Open Model -Zoo `__ is used in -this tutorial. - - -**Table of contents:** - - -- `Imports <#imports>`__ -- `Download the Model and data - samples <#download-the-model-and-data-samples>`__ -- `Load the Model <#load-the-model>`__ -- `Select inference device <#select-inference-device>`__ -- `Load an Image <#load-an-image>`__ -- `Do Inference <#do-inference>`__ -- `Create Explainer <#create-explainer>`__ -- `Do Explanation <#do-explanation>`__ - - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - # Install openvino package - %pip install -q "openvino>=2024.2.0" opencv-python tqdm - - # Install openvino xai package - %pip install -q --no-deps "openvino-xai>=1.1.0" - %pip install -q -U "numpy==1.*" - %pip install -q scipy - - %pip install -q "matplotlib>=3.4" - -Imports -------- - - - -.. code:: ipython3 - - from pathlib import Path - - import cv2 - import matplotlib.pyplot as plt - import numpy as np - import openvino as ov - import openvino_xai as xai - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("explainable-ai-1-basic.ipynb") - -Download the Model and data samples ------------------------------------ - - - -.. 
code:: ipython3 - - base_artifacts_dir = Path("./artifacts").expanduser() - - model_name = "v3-small_224_1.0_float" - model_xml_name = f"{model_name}.xml" - model_bin_name = f"{model_name}.bin" - - model_xml_path = base_artifacts_dir / model_xml_name - - base_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/mobelinet-v3-tf/FP32/" - - if not model_xml_path.exists(): - download_file(base_url + model_xml_name, model_xml_name, base_artifacts_dir) - download_file(base_url + model_bin_name, model_bin_name, base_artifacts_dir) - else: - print(f"{model_name} already downloaded to {base_artifacts_dir}") - -Select inference device ------------------------ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - device - -Load the Model --------------- - - - -.. code:: ipython3 - - core = ov.Core() - model = core.read_model(model=model_xml_path) - compiled_model = core.compile_model(model=model, device_name=device.value) - -Load an Image -------------- - - - -.. code:: ipython3 - - # Download the image from the openvino_notebooks storage - - if not Path("data/coco.jpg").exists(): - image_filename = download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco.jpg", - directory="data", - ) - - # The MobileNet model expects images in RGB format. - image = cv2.cvtColor(cv2.imread(filename=str(image_filename)), code=cv2.COLOR_BGR2RGB) - - # Resize to MobileNet image shape. - input_image = cv2.resize(src=image, dsize=(224, 224)) - - # Reshape to model input shape. - input_image = np.expand_dims(input_image, 0) - plt.imshow(image) - - -.. parsed-literal:: - - 'data/coco.jpg' already exists. - - - - -.. parsed-literal:: - - - - - - -.. image:: explainable-ai-1-basic-with-output_files/explainable-ai-1-basic-with-output_11_2.png - - -Do Inference ------------- - - - -.. code:: ipython3 - - result_infer = compiled_model([input_image])[0] - result_index = np.argmax(result_infer) - -.. code:: ipython3 - - if not Path("data/imagenet_2012.txt").exists(): - imagenet_filename = download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/datasets/imagenet/imagenet_2012.txt", - directory="data", - ) - - imagenet_classes = imagenet_filename.read_text().splitlines() - -.. code:: ipython3 - - # The model description states that for this model, class 0 is a background. - # Therefore, a background must be added at the beginning of imagenet_classes. - imagenet_classes = ["background"] + imagenet_classes - - print(f"class index: {result_index}") - print(f"class name: {imagenet_classes[result_index]}") - print(f"class score: {result_infer[0][result_index]:.2f}") - - -.. parsed-literal:: - - class index: 206 - class name: n02099267 flat-coated retriever - class score: 0.76 - - -Create Explainer ----------------- - - - -.. code:: ipython3 - - explainer = xai.Explainer( - model=model, - task=xai.Task.CLASSIFICATION, - ) - - -.. parsed-literal:: - - INFO:openvino_xai:Assigning preprocess_fn to identity function assumes that input images were already preprocessed by user before passing it to the model. Please define preprocessing function OR preprocess images beforehand. - INFO:openvino_xai:Target insertion layer is not provided - trying to find it in auto mode. - INFO:openvino_xai:Using ReciproCAM method (for CNNs). - INFO:openvino_xai:Explaining the model in white-box mode. 
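-
-The next section requests an explanation for the single predicted class
-index, but ``targets`` also accepts several classes at once. The helper
-below is a small illustrative sketch (the ``get_top_targets`` name is
-not part of the OpenVINO XAI API) that collects the top-k class indices
-from the inference result obtained above, so they can be passed as
-``targets`` if you want to compare explanations for competing classes.
-
-.. code:: ipython3
-
-    def get_top_targets(logits, k=3):
-        # Pick the k highest-scoring class indices from a [1, num_classes] result.
-        return np.argsort(logits[0])[-k:][::-1].tolist()
-
-
-    top_targets = get_top_targets(result_infer, k=3)
-    print("Top predicted class indices:", top_targets)
-    print("Top predicted class names:", [imagenet_classes[i] for i in top_targets])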
- - -Do Explanation --------------- - - - -Explainer generates explanation which contains saliency maps ({target: -saliency_map}). For classification, targets are indices of the classes. - -.. code:: ipython3 - - explanation = explainer( - data=input_image, - targets=result_index, # can be a single target or a container of targets - label_names=imagenet_classes, # optional, list of label names - overlay=True, # saliency map overlays over the input image, defaults to False - ) - explanation.plot() - - - -.. image:: explainable-ai-1-basic-with-output_files/explainable-ai-1-basic-with-output_19_0.png - - -Note: by default, overlay is applied over the image in the ``data`` -argument. In this case, ``data`` was preprocessed (e.g. resized to -224x224), but still recognizable by human. In order for the overlay to -applied over the original image, provide original image with -``original_image`` argument (please refer to `OpenVINO™ Explainable AI -Toolkit (2/3): Deep Dive `__). - -Above saliency map can help to answer the question: “Which part of the -image mostly contributes to the model predicted class: (206, ‘n02099267 -flat-coated retriever’)?” - -Observing saliency map overlay, it might be concluded that the model is -using the right features (pixels) to make a prediction. - -Save saliency maps for the further visual analysis: - -.. code:: ipython3 - - explanation.save(base_artifacts_dir) diff --git a/docs/notebooks/explainable-ai-1-basic-with-output_files/explainable-ai-1-basic-with-output_11_2.png b/docs/notebooks/explainable-ai-1-basic-with-output_files/explainable-ai-1-basic-with-output_11_2.png deleted file mode 100644 index a8fc791b3e4c52..00000000000000 --- a/docs/notebooks/explainable-ai-1-basic-with-output_files/explainable-ai-1-basic-with-output_11_2.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:55b1a955ec7a4a7394f905837b1a1686d3bb5130565eb9d4901eade821e6757c -size 387941 diff --git a/docs/notebooks/explainable-ai-1-basic-with-output_files/explainable-ai-1-basic-with-output_19_0.png b/docs/notebooks/explainable-ai-1-basic-with-output_files/explainable-ai-1-basic-with-output_19_0.png deleted file mode 100644 index c6484cb68eeb33..00000000000000 --- a/docs/notebooks/explainable-ai-1-basic-with-output_files/explainable-ai-1-basic-with-output_19_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:26d8ae99fc5c2d7573e5243b1711dab83c7f4658aa423d067ebd17fd87d336c5 -size 351476 diff --git a/docs/notebooks/explainable-ai-2-deep-dive-with-output.rst b/docs/notebooks/explainable-ai-2-deep-dive-with-output.rst deleted file mode 100644 index ea314c76e40faf..00000000000000 --- a/docs/notebooks/explainable-ai-2-deep-dive-with-output.rst +++ /dev/null @@ -1,907 +0,0 @@ -OpenVINO™ Explainable AI Toolkit (2/3): Deep Dive -================================================= - -.. warning:: - - Important note: This notebook requires python >= 3.10. Please make - sure that your environment fulfill to this requirement before running - it - -This is the **second notebook** in series of exploring `OpenVINO™ -Explainable AI -(XAI) `__: - -1. `OpenVINO™ Explainable AI Toolkit (1/3): - Basic `__ -2. `OpenVINO™ Explainable AI Toolkit (2/3): Deep - Dive `__ -3. `OpenVINO™ Explainable AI Toolkit (3/3): Saliency map - interpretation `__ - -`OpenVINO™ Explainable AI -(XAI) `__ provides a -suite of XAI algorithms for visual explanation of -`OpenVINO™ `__ Intermediate -Representation (IR) models. 
- -Using **OpenVINO XAI**, you can generate **saliency maps** that -highlight regions of interest in input images from the model’s -perspective. This helps users understand why complex AI models produce -specific responses. - -This notebook shows an example of how to use OpenVINO XAI, exploring its -methods and functionality. - -It displays a heatmap indicating areas of interest where a neural -network (for classification or detection) focuses before making a -decision. - -Let’s imagine the case that our OpenVINO IR model is up and running on a -inference pipeline. While watching the outputs, we may want to analyze -the model’s behavior for debugging or understanding purposes. - -By using the OpenVINO XAI ``Explainer``, we can visualize why the model -gives such responses, meaning on which areas it focused before -predicting a particular label. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ - - - `Install requirements <#install-requirements>`__ - - `Imports <#imports>`__ - - `Download IR model <#download-ir-model>`__ - - `Load the Image <#load-the-image>`__ - - `Preprocess image for - MobileNet <#preprocess-image-for-mobilenet>`__ - -- `Basic usage: Explainer in AUTO - mode <#basic-usage-explainer-in-auto-mode>`__ - - - `Create Explainer object <#create-explainer-object>`__ - - `Generate explanation <#generate-explanation>`__ - - `Visualize saliency maps <#visualize-saliency-maps>`__ - - `Save saliency maps <#save-saliency-maps>`__ - - `Generate saliency maps for all - classes <#generate-saliency-maps-for-all-classes>`__ - -- `Pre- and post-process - functions <#pre--and-post-process-functions>`__ -- `Visualization Parameters <#visualization-parameters>`__ -- `Explainer in WHITEBOX mode <#explainer-in-whitebox-mode>`__ - - - `ReciproCAM XAI method <#reciprocam-xai-method>`__ - - `Insert XAI branch <#insert-xai-branch>`__ - - `Insertion-related parameters <#insertion-related-parameters>`__ - -- `Explainer in BLACKBOX mode <#explainer-in-blackbox-mode>`__ -- `Advanced <#advanced>`__ - - - `Import ImageNet label names and add them to saliency - maps <#import-imagenet-label-names-and-add-them-to-saliency-maps>`__ - - `Activation map XAI method <#activation-map-xai-method>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install requirements -~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%capture - - import platform - - # Install openvino package - %pip install -q "openvino>=2024.2.0" opencv-python tqdm scipy - - %pip install -q --no-deps "openvino-xai>=1.1.0" - %pip install -q -U "numpy==1.*" - %pip install -q scipy - - %pip install -q "matplotlib>=3.4" - -Imports -~~~~~~~ - - - -.. 
code:: ipython3 - - from pathlib import Path - - import cv2 - import matplotlib.pyplot as plt - import numpy as np - - import openvino.runtime as ov - from openvino.runtime.utils.data_helpers.wrappers import OVDict - import openvino_xai as xai - from openvino_xai.explainer import ExplainMode - from openvino_xai.explainer.explanation import Explanation - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("explainable-ai-2-deep-dive.ipynb") - -Download IR model -~~~~~~~~~~~~~~~~~ - - - -In this notebook for demonstration purposes we’ll use an already -converted to IR model from OpenVINO storage. - -.. code:: ipython3 - - base_artifacts_dir = Path("./artifacts").expanduser() - - model_name = "v3-small_224_1.0_float" - model_xml_name = f"{model_name}.xml" - model_bin_name = f"{model_name}.bin" - - model_xml_path = base_artifacts_dir / model_xml_name - - base_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/mobelinet-v3-tf/FP32/" - - if not model_xml_path.exists(): - download_file(base_url + model_xml_name, model_xml_name, base_artifacts_dir) - download_file(base_url + model_bin_name, model_bin_name, base_artifacts_dir) - else: - print(f"{model_name} already downloaded to {base_artifacts_dir}") - -.. code:: ipython3 - - # Create ov.Model - model = ov.Core().read_model(model_xml_path) - -Load the Image -~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Download the image from the openvino_notebooks storage - if not Path("data/coc.jpg").exists(): - image_filename = download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco.jpg", - directory="data", - ) - - # The MobileNet model expects images in RGB format. - image = cv2.cvtColor(cv2.imread(filename=str(image_filename)), code=cv2.COLOR_BGR2RGB) - plt.imshow(image) - - -.. parsed-literal:: - - 'data/coco.jpg' already exists. - - - - -.. parsed-literal:: - - - - - - -.. image:: explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_11_2.png - - -Preprocess image for MobileNet -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Resize to MobileNetV3 input image shape. - preprocessed_image = cv2.resize(src=image, dsize=(224, 224)) - # Add batch dimension - preprocessed_image = np.expand_dims(preprocessed_image, 0) - -Basic usage: ``Explainer`` in ``AUTO`` mode -------------------------------------------- - - - -The easiest way to generate saliency maps is to use ``Explainer`` in -``ExplainMode.AUTO`` mode (``AUTO`` mode is used by default). - -Under the hood of ``AUTO`` mode, ``Explainer`` will first try to run the -``WHITEBOX`` mode. If ``WHITEBOX`` fails, it will then run the -``BLACKBOX`` mode as a fallback option. See more details about -`WHITEBOX <#explainer-in-whitebox-mode>`__ and -`BLACKBOX <#explainer-in-blackbox-mode>`__ modes below. - -Generating saliency maps involves model inference. The explainer will -perform model inference, but to do so, it requires ``preprocess_fn`` and -``postprocess_fn``. 
We can avoid passing ``preprocess_fn`` by -preprocessing (e.g., resizing and adding a batch dimension as shown -above) the input data beforehand - by default, ``preprocess_fn`` is the -identity function. We expect that current example will successfully use -``WHITEBOX`` mode under the hood, therefore we don’t pass -``postprocess_fn`` (``postprocess_fn`` is not required for ``WHITEBOX`` -mode, only for ``BLACKBOX``). - -To learn more about pre- and post-process functions, refer to the `pre- -and post-process functions <#pre--and-post-process-functions>`__ -section. - -Create ``Explainer`` object -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - explainer = xai.Explainer( - model=model, - task=xai.Task.CLASSIFICATION, - ) - - -.. parsed-literal:: - - INFO:openvino_xai:Assigning preprocess_fn to identity function assumes that input images were already preprocessed by user before passing it to the model. Please define preprocessing function OR preprocess images beforehand. - INFO:openvino_xai:Target insertion layer is not provided - trying to find it in auto mode. - INFO:openvino_xai:Using ReciproCAM method (for CNNs). - INFO:openvino_xai:Explaining the model in white-box mode. - - -Generate ``explanation`` -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The predicted class for this model-image pair is -``flat-coated_retriever`` with class index ``206``. So here and further -we will check saliency maps for this index. - -.. code:: ipython3 - - # You can choose class(es) to generate saliency maps for. - # In this notebook we will check maps for predicted class with index 206 - "flat-coated retriever" - retriever_class_index = 206 - -.. code:: ipython3 - - explanation = explainer( - preprocessed_image, - targets=retriever_class_index, # can be a single target or a container of targets - overlay=True, # saliency map overlay over the original image, False by default, set to True for better visual inspection - ) - -Visualize saliency maps -~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - explanation: Explanation - # explanation.saliency_map: Dict[int: np.ndarray] # where key - class id, value - processed saliency map (e.g. 354 x 500 x 3 shape) - - # Check saved saliency maps - print(f"Saliency maps were generated for the following classes: {explanation.targets}") - print(f"Saliency map size: {explanation.shape}") - - # Visualize generated saliency maps for each target class (.plot() supports plotting multiple saliency maps) - explanation.plot() - - -.. parsed-literal:: - - Saliency maps were generated for the following classes: [206] - Saliency map size: (224, 224, 3) - - - -.. image:: explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_22_1.png - - -Save saliency maps -~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Save saliency map - explanation.save(base_artifacts_dir, "explain_auto_") - -.. code:: ipython3 - - # Plot saved saliency map - image_sal_map = cv2.imread(f"{base_artifacts_dir}/explain_auto_{retriever_class_index}.jpg") - image_sal_map = cv2.cvtColor(image_sal_map, cv2.COLOR_BGR2RGB) - plt.imshow(image_sal_map) - - - - -.. parsed-literal:: - - - - - - -.. image:: explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_25_1.png - - -Generate saliency maps for all classes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To obtain saliency maps for all classes, set ``targets`` to ``None`` or -``-1``. - -.. 
code:: ipython3 - - explanation = explainer(preprocessed_image, targets=-1) - - # Check saved saliency maps - print(f"Saliency maps were generated for the following classes: {explanation.targets[:5]} ... {explanation.targets[-5:]}") - print(f"Saliency map size: {explanation.shape}") - - -.. parsed-literal:: - - Saliency maps were generated for the following classes: [0, 1, 2, 3, 4] ... [996, 997, 998, 999, 1000] - Saliency map size: (224, 224, 3) - - -Pre- and post-process functions -------------------------------- - - - -The explainer can apply pre-processing internally during model -inference, allowing you to provide a raw image as input to the -explainer. - -To enable this, define ``preprocess_fn`` and provide it to the explainer -constructor. By default, ``preprocess_fn`` is an identity function that -passes the input without any changes, assuming it is preprocessed -beforehand. - -In ``AUTO`` mode, the explainer tries to run the ``WHITEBOX`` mode -first. If it fails, the corresponding exception will be raised, and the -``BLACKBOX`` mode will be enabled as a fallback. - -The ``BLACKBOX`` mode requires access to the output ``logits`` -(activated or not). Therefore, in such cases, ``postprocess_fn`` is -required, which accepts the raw IR model output and returns ``logits`` -(see below for a reference). - -.. code:: ipython3 - - def preprocess_fn(x: np.ndarray) -> np.ndarray: - # Implementing pre-processing based on model's pipeline - x = cv2.resize(src=x, dsize=(224, 224)) - - # Add batch dimension - x = np.expand_dims(x, 0) - return x - - - def postprocess_fn(x: OVDict): - # Implementing post-processing function based on model's pipeline - # Return "logits" model output - return x[0] - -.. code:: ipython3 - - # Create explainer object - explainer = xai.Explainer( - model=model, - task=xai.Task.CLASSIFICATION, - preprocess_fn=preprocess_fn, - postprocess_fn=postprocess_fn, - ) - - explanation = explainer(image, targets=retriever_class_index) - - -.. parsed-literal:: - - INFO:openvino_xai:Target insertion layer is not provided - trying to find it in auto mode. - INFO:openvino_xai:Using ReciproCAM method (for CNNs). - INFO:openvino_xai:Explaining the model in white-box mode. - - -Visualization Parameters ------------------------- - - - -- resize (True by default): If True, resize saliency map to the input - image size. -- colormap (True by default): If True, apply colormap to the grayscale - saliency map. -- overlay (False by default): If True, generate overlay of the saliency - map over the input image. -- original_input_image (None by default): Provide the original, - unprocessed image to apply the overlay. This ensures the overlay is - not applied to a preprocessed image, which may be resized or - normalized and lose readability. -- overlay_weight (0.5 by default): Weight of the saliency map when - overlaying the input data with the saliency map. - -.. code:: ipython3 - - # Create explainer object - explainer = xai.Explainer(model=model, task=xai.Task.CLASSIFICATION) - - # Generate overlayed saliency_map - explanation = explainer( - preprocessed_image, - targets=[retriever_class_index], # target can be a single label index, label name or a list of indices/names - overlay=True, # False by default - original_input_image=image, # to apply overlay on the original image instead of preprocessed one that was used for the explainer - ) - - explanation.plot() - - # Save saliency map - explanation.save(base_artifacts_dir, "overlay_") - - -.. 
parsed-literal:: - - INFO:openvino_xai:Assigning preprocess_fn to identity function assumes that input images were already preprocessed by user before passing it to the model. Please define preprocessing function OR preprocess images beforehand. - INFO:openvino_xai:Target insertion layer is not provided - trying to find it in auto mode. - INFO:openvino_xai:Using ReciproCAM method (for CNNs). - INFO:openvino_xai:Explaining the model in white-box mode. - - - -.. image:: explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_34_1.png - - -.. code:: ipython3 - - # Generate saliency map without overlay over original image - explanation = explainer( - preprocessed_image, - targets=[retriever_class_index], # target can be a single label index, label name or a list of indices/names - overlay=False, # False by default - ) - - explanation.plot() - - # Save saliency map - explanation.save(base_artifacts_dir, "colormap_") - - - -.. image:: explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_35_0.png - - -.. code:: ipython3 - - # Return low-resolution (raw) gray-scale saliency map - explanation = explainer( - preprocessed_image, - targets=[retriever_class_index], # target can be a single label index, label name or a list of indices/names - resize=False, # True by default - colormap=False, # True by default - ) - - explanation.plot() - - # Save saliency map - explanation.save(base_artifacts_dir, "grayscale_") - - - -.. image:: explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_36_0.png - - -``Explainer`` in ``WHITEBOX`` mode ----------------------------------- - - - -``ReciproCAM`` XAI method -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -``Explainer`` in ``WHITEBOX`` mode treats the model as a white box and -performs its inner modifications. ``Explainer`` inserts extra XAI nodes -after the backbone to estimate which activations are important for model -prediction. - -If a method is not specified, the XAI branch will be generated using the -`ReciproCAM `__ method. - -By default, the insertion of the XAI branch will be done automatically -by searching for the correct node - ``target_layer`` (``target_layer`` -can be specified manually). - -It works quickly and precisely, requiring only one model inference. - -.. code:: ipython3 - - # Create explainer object - explainer = xai.Explainer( - model=model, - task=xai.Task.CLASSIFICATION, - preprocess_fn=preprocess_fn, - explain_mode=ExplainMode.WHITEBOX, # defaults to ExplainMode.AUTO - explain_method=xai.Method.RECIPROCAM, # ReciproCAM is the default white-box method for CNNs - ) - - -.. parsed-literal:: - - INFO:openvino_xai:Target insertion layer is not provided - trying to find it in auto mode. - INFO:openvino_xai:Using ReciproCAM method (for CNNs). - INFO:openvino_xai:Explaining the model in white-box mode. - - -Insert XAI branch -~~~~~~~~~~~~~~~~~ - - - -It’s possible to update the model with an XAI branch using the -``insert_xai`` functional API. - -``insert_xai`` will return an OpenVINO model with the XAI branch -inserted and an additional ``saliency_map`` output. - -This helps to avoid OpenVINO XAI dependency in the inference -environment. - -**Note**: XAI branch introduce an additional computational overhead -(usually less than a single model forward pass). - -.. 
code:: ipython3 - - # insert XAI branch - model_xai: ov.Model - model_xai = xai.insert_xai( - model, - task=xai.Task.CLASSIFICATION, - explain_method=xai.Method.RECIPROCAM, - target_layer="MobilenetV3/Conv_1/Conv2D", # optional, by default insert_xai will try to find target_layer automatically - embed_scaling=True, - ) - - -.. parsed-literal:: - - INFO:openvino_xai:Target insertion layer MobilenetV3/Conv_1/Conv2D is provided. - INFO:openvino_xai:Using ReciproCAM method (for CNNs). - INFO:openvino_xai:Insertion of the XAI branch into the model was successful. - - -**Note**: ``insert_xai`` supports both OpenVINO IR and PyTorch models. -See documentation for more details. - -Insertion-related parameters -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -If automatic search for correct node fails, you can set up a correct -node manually with ``target_layer`` argument. For classification, it’s -the last backbone node with shape [1, num_channels, feature_map_height, -feature_map_width]. For example, for the used MobileNetV3 it will be -``MobilenetV3/Conv_1/Conv2D`` layer with [1, 576, 7, 7] output shape. - -To find the right ``target_layer`` for your model, check the name of the -last convolutional node in the backbone using ``.XML`` file (optionally, -use some graph visualization tool, such as Netron). - -``embed_scaling`` **default True** (for speed purposes), this parameter -ensures that saliency map scaling is embedded into the graph, which -results in being able to visualize saliency maps right away without -further postprocessing. - -.. code:: ipython3 - - # Create explainer object - explainer = xai.Explainer( - model=model, - task=xai.Task.CLASSIFICATION, - preprocess_fn=preprocess_fn, - explain_mode=ExplainMode.AUTO, - explain_method=xai.Method.RECIPROCAM, - # target_layer="last_conv_node_name", # target_layer - node after which XAI branch will be inserted - target_layer="MobilenetV3/Conv_1/Conv2D", - embed_scaling=True, # True by default. If set to True, saliency map scale (0 ~ 255) operation is embedded in the model - ) - - -.. parsed-literal:: - - INFO:openvino_xai:Target insertion layer MobilenetV3/Conv_1/Conv2D is provided. - INFO:openvino_xai:Using ReciproCAM method (for CNNs). - INFO:openvino_xai:Explaining the model in white-box mode. - - -``Explainer`` in ``BLACKBOX`` mode ----------------------------------- - - - -``Explainer`` in ``BLACKBOX`` mode treats the model as a black box -without altering its internal structure. Therefore, this method will -work on any model that can be inferred and return class scores as -output. - -While it is convenient to treat every model as a black box for -explanation purposes, black-box method may require a significant number -of inferences (AISE requires 120-500 model inferences). - -Given that the quality of the saliency maps usually correlates with the -number of available inferences, we propose the following presets for the -black-box methods: ``Preset.SPEED``, ``Preset.BALANCE``, -``Preset.QUALITY`` (``Preset.BALANCE`` is used by default). - -AISE (Adaptive Input Sampling for Explanation of Black-box Models) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -AISE is used as a default black-box method. AISE formulates saliency map -generation as a kernel density estimation (KDE) problem, and adaptively -sample input masks using a derivative-free optimizer to maximize mask -saliency score. - -.. 
code:: ipython3 - - # Create explainer object - explainer = xai.Explainer( - model=model, - task=xai.Task.CLASSIFICATION, - preprocess_fn=preprocess_fn, - postprocess_fn=postprocess_fn, - explain_mode=ExplainMode.BLACKBOX, # defaults to AUTO - ) - - # Generate explanation - explanation = explainer( - image, - targets=retriever_class_index, - overlay=True, - ) - - -.. parsed-literal:: - - INFO:openvino_xai:Explaining the model in black-box mode. - - -.. code:: ipython3 - - # Plot saliency map - explanation.plot() - - # Save saliency map - explanation.save(base_artifacts_dir, "blackbox_aise_") - - - -.. image:: explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_49_0.png - - -RISE (Randomized Input Sampling for Explanation of Black-box Models) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -`RISE `__ probes a model by -sub-sampling the input image via random masks and records its response -to each of them. RISE creates random masks from down-scaled space -(e.g. 7×7 grid) and adds random translation shifts for the pixel-level -explanation with further up-sampling. Weighted sum of all sampled masks -used to generate the fine-grained saliency map. - -.. code:: ipython3 - - # Create explainer object - explainer = xai.Explainer( - model=model, - task=xai.Task.CLASSIFICATION, - preprocess_fn=preprocess_fn, - postprocess_fn=postprocess_fn, - explain_mode=ExplainMode.BLACKBOX, # defaults to AUTO - explain_method=xai.Method.RISE, # xai.Method.AISE is used by default - ) - - # Generate explanation - explanation = explainer( - image, - targets=retriever_class_index, - overlay=True, - ) - -.. code:: ipython3 - - # Plot saliency map - explanation.plot() - - # Save saliency map - explanation.save(base_artifacts_dir, "blackbox_rise_") - - - -.. image:: explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_52_0.png - - -Advanced --------- - - - -Import ImageNet label names and add them to saliency maps -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -If ``label_names`` are not provided to the explainer call, the saved -saliency map will have the predicted class index, not the label name. -For example, ``206.jpg`` instead of ``retriever.jpg``. - -To conveniently view label names in saliency maps, we provide ImageNet -label names information to the explanation call. - -.. code:: ipython3 - - if not Path("data/imagenet_2012.txt").exists(): - - imagenet_filename = download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/datasets/imagenet/imagenet_2012.txt", - directory="data", - ) - - imagenet_classes = imagenet_filename.read_text().splitlines() - - -.. parsed-literal:: - - 'data/imagenet_2012.txt' already exists. - - -.. code:: ipython3 - - imagenet_labels = [] - for label in imagenet_classes: - class_label = " ".join(label.split(" ")[1:]) - first_class_label = class_label.split(",")[0].replace(" ", "_") - imagenet_labels.append(first_class_label) - - print(" ".join(imagenet_labels[:10])) - - -.. parsed-literal:: - - tench goldfish great_white_shark tiger_shark hammerhead electric_ray stingray cock hen ostrich - - -.. code:: ipython3 - - # The model description states that for this model, class 0 is a background. - # Therefore, a background must be added at the beginning of imagenet_classes. - imagenet_labels = ["background"] + imagenet_labels - -.. 
code:: ipython3 - - # Create explainer object - explainer = xai.Explainer( - model=model, - task=xai.Task.CLASSIFICATION, - preprocess_fn=preprocess_fn, - explain_mode=ExplainMode.WHITEBOX, - ) - - # Adding ImageNet label names. - explanation = explainer( - image, - # Return saliency maps for 2 named labels, possible if label_names is provided - targets=["flat-coated_retriever", "microwave"], # slso label indices [206, 652] are possible as target - label_names=imagenet_labels, - ) - - -.. parsed-literal:: - - INFO:openvino_xai:Target insertion layer is not provided - trying to find it in auto mode. - INFO:openvino_xai:Using ReciproCAM method (for CNNs). - INFO:openvino_xai:Explaining the model in white-box mode. - - -.. code:: ipython3 - - # Save saliency map - explanation.save(base_artifacts_dir, "label_names_") - -Below in ``base_artifacts_dir / "label_names"`` you can see saved -saliency maps with label name on it: - -.. code:: ipython3 - - # See saliency mas saved in `output` with predicted label in image name - for file_name in base_artifacts_dir.glob("label_names_*"): - print(file_name) - - -.. parsed-literal:: - - artifacts/label_names_microwave.jpg - artifacts/label_names_flat-coated_retriever.jpg - - -Activation map XAI method -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The Activation Map method shows a general attention map without respect -to specific classes. It can be useful for understanding which areas the -model identifies as important. - -If the explanation method is set to ``Method.ACTIVATIONMAP``, instead of -saliency maps for each class, the activation map is returned as -``explanation.saliency_map["per_image_map"]``. - -.. code:: ipython3 - - # Create explainer object - explainer = xai.Explainer( - model=model, - task=xai.Task.CLASSIFICATION, - preprocess_fn=preprocess_fn, - explain_mode=ExplainMode.WHITEBOX, - explain_method=xai.Method.ACTIVATIONMAP, - ) - - explanation = explainer(image, overlay=True) - explanation.plot() - - -.. parsed-literal:: - - INFO:openvino_xai:Target insertion layer is not provided - trying to find it in auto mode. - INFO:openvino_xai:Using ActivationMap method (for CNNs). - INFO:openvino_xai:Explaining the model in white-box mode. - - - -.. 
image:: explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_63_1.png - diff --git a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_11_2.png b/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_11_2.png deleted file mode 100644 index a8fc791b3e4c52..00000000000000 --- a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_11_2.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:55b1a955ec7a4a7394f905837b1a1686d3bb5130565eb9d4901eade821e6757c -size 387941 diff --git a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_22_1.png b/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_22_1.png deleted file mode 100644 index 8822fe615f6f19..00000000000000 --- a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_22_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:83b40d8eb0d4a89c7c24a9cc676e3b4f298e5eabdb7a9d5a5604d4c8533dca7f -size 342254 diff --git a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_25_1.png b/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_25_1.png deleted file mode 100644 index 125556adbb530c..00000000000000 --- a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_25_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:354fa33f0a0e6becba16a5c65dde256282215e75a1e379500fd9e9d5fed7845e -size 235673 diff --git a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_34_1.png b/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_34_1.png deleted file mode 100644 index 104fcabc090172..00000000000000 --- a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_34_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:14efddd662af5ecae15f38d9fa20e001d7c1f1f26418d3a89ea0f489a5aee993 -size 312661 diff --git a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_35_0.png b/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_35_0.png deleted file mode 100644 index 60fdb91b059005..00000000000000 --- a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_35_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cae625a36aeb18c1fd5f5b7e673b7e58836f8bf914b43906cb4e5c81cb33885f -size 62966 diff --git a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_36_0.png b/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_36_0.png deleted file mode 100644 index 1c0f2dbbeb4f3a..00000000000000 --- a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_36_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c2596ddca7816d246bea422e0c9b809a41feaf65163d6c27c0b422ba6f16a440 -size 4947 diff --git 
a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_49_0.png b/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_49_0.png deleted file mode 100644 index 2c5b7a96ca9399..00000000000000 --- a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_49_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b0d33ea52b17c068fb6e2bff4ad0d4f0993e38a6a074a3ac1af3ccaefec2199d -size 326076 diff --git a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_52_0.png b/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_52_0.png deleted file mode 100644 index 6fa958fe614823..00000000000000 --- a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_52_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d41ea03631f0daa7400793e57138b4af52e13dc5294a3440688dd27a5034215e -size 324115 diff --git a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_63_1.png b/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_63_1.png deleted file mode 100644 index 98255daa2893ee..00000000000000 --- a/docs/notebooks/explainable-ai-2-deep-dive-with-output_files/explainable-ai-2-deep-dive-with-output_63_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e60ccfb5582f57e813fbcd3ee55c35677f3cb391e7a4eb67b10319ee911f341 -size 314537 diff --git a/docs/notebooks/explainable-ai-3-map-interpretation-with-output.rst b/docs/notebooks/explainable-ai-3-map-interpretation-with-output.rst deleted file mode 100644 index 4017a21f036edf..00000000000000 --- a/docs/notebooks/explainable-ai-3-map-interpretation-with-output.rst +++ /dev/null @@ -1,839 +0,0 @@ -OpenVINO™ Explainable AI Toolkit (3/3): Saliency map interpretation -=================================================================== - -.. warning:: - - Important note: This notebook requires python >= 3.10. Please make - sure that your environment fulfill to this requirement before running - it - -This is the **third notebook** in series of exploring `OpenVINO™ -Explainable AI -(XAI) `__: - -1. `OpenVINO™ Explainable AI Toolkit (1/3): - Basic `__ -2. `OpenVINO™ Explainable AI Toolkit (2/3): Deep - Dive `__ -3. `OpenVINO™ Explainable AI Toolkit (3/3): Saliency map - interpretation `__ - -`OpenVINO™ Explainable AI -(XAI) `__ provides a -suite of XAI algorithms for visual explanation of -`OpenVINO™ `__ Intermediate -Representation (IR) models. - -Using **OpenVINO XAI**, you can generate **saliency maps** that -highlight regions of interest in input images from the model’s -perspective. This helps users understand why complex AI models produce -specific responses. - -This notebook shows how to use saliency maps to evaluate and debug model -reasoning. - -For example, it might be important to ensure that the model is using -relevant features (pixels) to make a correct prediction (e.g., it might -be desirable that the model is not relying on X class features to -predict Y class). On the other hand, it is valuable to observe which -features are used when the model is wrong. 
- -Below, we present examples of saliency map analysis for the following -cases: correct and highly-confident prediction, correct and -low-confident prediction, and wrong prediction. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ - - - `Install requirements <#install-requirements>`__ - - `Imports <#imports>`__ - - `Download dataset <#download-dataset>`__ - - `Download IR model <#download-ir-model>`__ - -- `Prepare model to run inference <#prepare-model-to-run-inference>`__ - - - `Select inference device <#select-inference-device>`__ - - `Load the Model <#load-the-model>`__ - - `Define preprocess_fn <#define-preprocess_fn>`__ - -- `Explain <#explain>`__ - - - `Create Explainer object <#create-explainer-object>`__ - - `Import ImageNet label names <#import-imagenet-label-names>`__ - - `Explain using ImageNet labels <#explain-using-imagenet-labels>`__ - -- `Notable use cases in ImageWoof - dataset <#notable-use-cases-in-imagewoof-dataset>`__ - - - `Explain for each use case <#explain-for-each-use-case>`__ - - `Visualize use case saliency - maps <#visualize-use-case-saliency-maps>`__ - - `Naming logic <#naming-logic>`__ - -- `Results <#results>`__ - - - `True Positive High confidence <#true-positive-high-confidence>`__ - - `True Positive Low confidence <#true-positive-low-confidence>`__ - - `False Positive High - confidence <#false-positive-high-confidence>`__ - - `Two mixed predictions <#two-mixed-predictions>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install requirements -~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%capture - - import platform - - # Install openvino package - %pip install -q "openvino>=2024.2.0" opencv-python tqdm - - # Install openvino xai package - %pip install -q --no-deps "openvino-xai>=1.1.0" - %pip install -q -U "numpy==1.*" - %pip install -q scipy - - %pip install -q "matplotlib>=3.4" - -Imports -~~~~~~~ - - - -.. code:: ipython3 - - import os - import zipfile - from pathlib import Path - - import cv2 - import matplotlib.pyplot as plt - import numpy as np - import requests - - import openvino.runtime as ov - import openvino_xai as xai - from openvino_xai.explainer import ExplainMode - - if not Path("notebook_utils.py").exists(): - # Fetch `notebook_utils` module - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("explainable-ai-3-map-interpretation.ipynb") - -Download dataset -~~~~~~~~~~~~~~~~ - - - -To see examples of saliency maps for different use cases, please -download the `ImageWoof -dataset `__ using the -code below. - -ImageWoof is a subset of 10 classes from ImageNet that are tricky to -classify since they’re all dog breeds. - -.. code:: ipython3 - - base_artifacts_dir = Path("./artifacts").expanduser() - data_folder = base_artifacts_dir / ".data" - -.. 
code:: ipython3 - - # Download 330 MB of 320 px ImageNet subset with dog breeds - if not (data_folder / "imagewoof320").exists(): - download_file( - "https://ultralytics.com/assets/imagewoof320.zip", - directory=data_folder, - ) - - # Define the path to the zip file and the destination directory - zip_path = data_folder / "imagewoof320.zip" - extract_dir = data_folder / "imagewoof320" - with zipfile.ZipFile(zip_path, "r") as zip_ref: - zip_ref.extractall(extract_dir) - - else: - print(f"Dataset is already downloaded to {base_artifacts_dir} and extracted.") - - image_folder_path = data_folder / "imagewoof320" / "imagewoof320" - -.. code:: ipython3 - - # Create list of images to explain - img_files = [] - img_files.extend(image_folder_path.rglob("*.JPEG")) - print(f"Number of images to get explanations: {len(img_files)}") - - # Get a fewer subset for fast execution - np.random.seed(42) - img_files = np.random.choice(img_files, 1) - print(f"Run explanations on fewer number of images: {len(img_files)}") - - -.. parsed-literal:: - - Number of images to get explanations: 12954 - Run explanations on fewer number of images: 1 - - -Download IR model -~~~~~~~~~~~~~~~~~ - - - -In this notebook, for demonstration purposes, we’ll use an already -converted to IR model ``mobilenetv3_large_100.ra_in1k``, from -`timm `__ (PyTorch -Image Models). This model requires specific preprocessing, including -scaling and normalization with certain values. - -.. code:: ipython3 - - model_name = "mobilenetv3_large_100.ra_in1k" - model_xml_name = f"{model_name}.xml" - model_bin_name = f"{model_name}.bin" - - model_xml_path = base_artifacts_dir / model_xml_name - - base_url = "https://storage.openvinotoolkit.org/repositories/openvino_training_extensions/models/custom_image_classification/" - - if not model_xml_path.exists(): - download_file(base_url + model_xml_name, model_xml_name, base_artifacts_dir) - download_file(base_url + model_bin_name, model_bin_name, base_artifacts_dir) - else: - print(f"{model_name} already downloaded to {base_artifacts_dir}") - -Prepare model to run inference ------------------------------- - - - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - - device - -Load the Model -~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - core = ov.Core() - - model = core.read_model(model=model_xml_path) - compiled_model = core.compile_model(model=model, device_name=device.value) - -Define ``preprocess_fn`` -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -This notebook using ``WHITEBOX`` mode for model explanation - it is -required to define function to preprocess data (the alternative is to -preprocess input data). Since the used model is originally from `timm -storage `__, it is -required to apply specific timm preprocessing, including normalization -and scaling with certain values. - -.. code:: ipython3 - - def preprocess_fn(x: np.ndarray) -> np.ndarray: - """ - Implementing own pre-process function based on model's implementation - """ - x = cv2.resize(src=x, dsize=(224, 224)) - - # Specific normalization for timm model - mean = np.array([123.675, 116.28, 103.53]) - std = np.array([58.395, 57.12, 57.375]) - x = (x - std) / mean - - # Reshape to model input shape to [channels, height, width]. 
- x = x.transpose((2, 0, 1)) - - # Add batch dimension - x = np.expand_dims(x, 0) - return x - -Explain -------- - - - -Create ``Explainer`` object -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The ``Explainer`` object can internally apply pre-processing during -model inference, allowing raw images as input. To enable this, define -``preprocess_fn`` and provide it to the explainer constructor. If -``preprocess_fn`` is not defined, it is assumed that the input is -preprocessed. - -.. code:: ipython3 - - # Create ov.Model - model = core.read_model(model=model_xml_path) - - # Create explainer object - explainer = xai.Explainer( - model=model, - task=xai.Task.CLASSIFICATION, - preprocess_fn=preprocess_fn, - explain_mode=ExplainMode.WHITEBOX, - ) - - -.. parsed-literal:: - - INFO:openvino_xai:Target insertion layer is not provided - trying to find it in auto mode. - INFO:openvino_xai:Using ReciproCAM method (for CNNs). - INFO:openvino_xai:Explaining the model in white-box mode. - - -Import ImageNet label names -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -If ``label_names`` are not provided to the explainer call, the saved -saliency map will have the predicted class index, not the name. For -example, ``167.jpg`` instead of ``English_foxhound.jpg``. - -To conveniently view label names in saliency maps, we prepare and -provide ImageNet label names information to the explanation call. - -.. code:: ipython3 - - %%capture - imagenet_filename = Path(".data/imagenet_2012.txt") - if not imagenet_filename.exists(): - imagenet_filename = download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/datasets/imagenet/imagenet_2012.txt", - directory=".data", - ) - - imagenet_classes = imagenet_filename.read_text().splitlines() - -.. code:: ipython3 - - # Get ImageNet label names to add them to explanations - imagenet_labels = [] - for label in imagenet_classes: - class_label = " ".join(label.split(" ")[1:]) - first_class_label = class_label.split(",")[0].replace(" ", "_") - imagenet_labels.append(first_class_label) - - # Check, how dog breed labels will look in saved saliency map names - dog_breeds_indices = [155, 159, 162, 167, 193, 207, 229, 258, 273] - print(" ".join([imagenet_labels[ind] for ind in dog_breeds_indices])) - - -.. parsed-literal:: - - Shih-Tzu Rhodesian_ridgeback beagle English_foxhound Australian_terrier golden_retriever Old_English_sheepdog Samoyed dingo - - -Explain using ImageNet labels -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To use ImageNet label names, pass them as the ``label_names`` argument -to the explainer. - -.. code:: ipython3 - - output = base_artifacts_dir / "saliency_maps" / "multiple_images" - - # Explain model and save results using ImageNet label names - for image_path in img_files: - image = cv2.imread(str(image_path)) - explanation = explainer( - image, - targets=[ - "flat-coated_retriever", - "Samoyed", - ], # also label indices [206, 258] are possible as target - label_names=imagenet_labels, - ) - explanation.save(output, f"{Path(image_path).stem}_") # pass prefix name with underscore - -Below in ``base_artifacts_dir / "saliency_maps" / "multiple_images"`` -you can see saved saliency maps: - -.. code:: ipython3 - - # See saliency that was saved in `output` with predicted label in image name - for file_name in output.glob("*"): - print(file_name) - - -.. 
parsed-literal:: - - artifacts/saliency_maps/multiple_images/n02088364_5768_Samoyed.jpg - artifacts/saliency_maps/multiple_images/n02088364_5768_flat-coated_retriever.jpg - - -Notable use cases in ImageWoof dataset --------------------------------------- - - - -Below are a few examples chosen to show cases when: - The correct class -was predicted with high confidence (``True Positive, high confidence``) -- The correct class was predicted, but with low confidence for some -reason (``True Positive, low confidence``) - The predicted class has -high confidence but was incorrect, one class was treated as another -(``False positive, high confidence``) - Two classes were predicted with -similar high confidence, with different saliency maps for each -(``Two predictions``) - -The cell below contains paths to images with those respective use cases: - -.. code:: ipython3 - - # Read paths to ImegeWoof pictures with notable use cases - use_cases_image_paths = { - "True_positive_high_confidence": { - "confidence": 0.79, - "paths": [ - "train/n02088364/n02088364_2019.JPEG", - "train/n02099601/n02099601_6505.JPEG", - "train/n02105641/n02105641_817.JPEG", - "train/n02111889/n02111889_17737.JPEG", - ], - }, - "True_positive_low_confidence": { - "confidence": 0.15, - "paths": [ - "train/n02086240/n02086240_1765.JPEG", - "val/n02086240/n02086240_1422.JPEG", - "train/n02086240/n02086240_3709.JPEG", - "val/n02099601/n02099601_7942.JPEG", - ], - }, - "False_positive_high_confidence": { - "confidence": 0.60, - "paths": [ - "train/n02087394/n02087394_6357.JPEG", - "val/n02088364/n02088364_2430.JPEG", - "train/n02088364/n02088364_12304.JPEG", - "train/n02096294/n02096294_2323.JPEG", - "train/n02099601/n02099601_4933.JPEG", - "val/n02111889/n02111889_1931.JPEG", - "train/n02111889/n02111889_14926.JPEG", - "val/n02115641/n02115641_5752.JPEG", - ], - }, - "True_positive_two_predictions": {"confidence": 0.17, "paths": ["train/n02099601/n02099601_634.JPEG", "train/n02111889/n02111889_374.JPEG"]}, - } - -.. code:: ipython3 - - # Add mapping from folder name to label and label_idx to define the ground-truth label - label_mapping = { - "n02088364": ("beagle", 162), - "n02099601": ("golden retriever", 207), - "n02105641": ("Old English sheepdog", 229), - "n02111889": ("Samoyed", 258), - "n02086240": ("Shih-Tzu", 155), - "n02089973": ("English foxhound", 167), - "n02087394": ("Rhodesian ridgeback", 159), - "n02096294": ("Australian terrier", 193), - "n02115641": ("dingo", 273), - } - -.. code:: ipython3 - - def get_model_predictions(conf_thr: float = 0.1) -> tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Run model inference and get predictions above a confidence threshold. - - Args: - conf_thr (float): Confidence threshold for filtering predictions. Defaults to 0.1. - - Returns: - tuple: A tuple containing: - - result_infer (np.ndarray): The raw inference results from the model. - - result_idxs (np.ndarray): Indices of the predictions above the confidence threshold. - - result_scores (np.ndarray): Scores of the predictions above the confidence threshold. 
- """ - logits = compiled_model([preprocess_fn(image)])[0] - result_infer = postprocess_fn(logits) - result_idxs = np.argwhere(result_infer > conf_thr).flatten() - result_scores = result_infer[result_idxs] - - for index, score in zip(result_idxs, result_scores): - print(f"Predicted class {imagenet_labels[index]}, index {index}, probability: {score:.2f}") - - return result_infer, result_idxs, result_scores - - - def postprocess_fn(x: np.ndarray) -> np.ndarray: - """ - Process model prediction - """ - prediction_processed = softmax(x) - return prediction_processed[0] # remove batch dimension - - - def softmax(x): - """Compute softmax values of x.""" - e_x = np.exp(x - np.max(x)) - return e_x / e_x.sum() - -Explain for each use case -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - output = base_artifacts_dir / "saliency_maps" / "imagewoof320" - - # Run explanation for chosen paths - for use_case in use_cases_image_paths: - os.makedirs(output / use_case, exist_ok=True) - image_paths = use_cases_image_paths[use_case]["paths"] - use_case_conf_thr = use_cases_image_paths[use_case]["confidence"] - - for image_path in image_paths: - image = cv2.imread(str(image_folder_path / image_path)) - image_name = Path(image_path).stem - - folder_name = image_name.split("_")[0] - gt_class, gt_class_idx = label_mapping[folder_name] - - scores, result_idxs, result_scores = get_model_predictions(use_case_conf_thr) - gt_conf = scores[gt_class_idx] - gt_info = f"gt_{gt_class}_{gt_conf:.2f}" - - explanation = explainer( - image, - targets=result_idxs, # return saliency maps for predicted classes - label_names=imagenet_labels, - overlay=True, - ) - - saliency_map_name_prefix = f"{image_name}_{gt_info}_pr_" - saliency_map_name_postfix = "_" - confidence_scores = {} - for idx, score in zip(result_idxs, result_scores): - confidence_scores[idx] = score - explanation.save( - dir_path=(output / use_case), - prefix=saliency_map_name_prefix, - postfix=saliency_map_name_postfix, - confidence_scores=confidence_scores, - ) - - -.. parsed-literal:: - - Predicted class beagle, index 162, probability: 0.97 - Predicted class golden_retriever, index 207, probability: 0.88 - Predicted class Old_English_sheepdog, index 229, probability: 0.96 - Predicted class Samoyed, index 258, probability: 0.94 - Predicted class Shih-Tzu, index 155, probability: 0.18 - Predicted class Shih-Tzu, index 155, probability: 0.18 - Predicted class Shih-Tzu, index 155, probability: 0.20 - Predicted class golden_retriever, index 207, probability: 0.18 - Predicted class dalmatian, index 251, probability: 0.98 - Predicted class bannister, index 421, probability: 0.78 - Predicted class car_mirror, index 475, probability: 0.82 - Predicted class quilt, index 750, probability: 0.80 - Predicted class bubble, index 971, probability: 0.79 - Predicted class dogsled, index 537, probability: 0.79 - Predicted class Arctic_fox, index 279, probability: 0.95 - Predicted class Chihuahua, index 151, probability: 0.93 - Predicted class golden_retriever, index 207, probability: 0.30 - Predicted class Labrador_retriever, index 208, probability: 0.57 - Predicted class Samoyed, index 258, probability: 0.43 - Predicted class crib, index 520, probability: 0.39 - - -.. code:: ipython3 - - # Check saved saliency maps for debugging purposes - for use_case in use_cases_image_paths: - print("\n", use_case) - for file_name in (output / use_case).glob("*"): - print(file_name.stem) - - -.. 
parsed-literal:: - - - True_positive_high_confidence - n02088364_2019_gt_beagle_0.97_pr_beagle_0.97 - n02099601_6505_gt_golden retriever_0.88_pr_golden_retriever_0.88 - n02105641_817_gt_Old English sheepdog_0.96_pr_Old_English_sheepdog_0.96 - n02111889_17737_gt_Samoyed_0.94_pr_Samoyed_0.94 - - True_positive_low_confidence - n02099601_7942_gt_golden retriever_0.18_pr_golden_retriever_0.18 - n02086240_3709_gt_Shih-Tzu_0.20_pr_Shih-Tzu_0.20 - n02086240_1422_gt_Shih-Tzu_0.18_pr_Shih-Tzu_0.18 - n02086240_1765_gt_Shih-Tzu_0.18_pr_Shih-Tzu_0.18 - - False_positive_high_confidence - n02088364_12304_gt_beagle_0.01_pr_car_mirror_0.82 - n02088364_2430_gt_beagle_0.00_pr_bannister_0.78 - n02099601_4933_gt_golden retriever_0.05_pr_bubble_0.79 - n02096294_2323_gt_Australian terrier_0.00_pr_quilt_0.80 - n02115641_5752_gt_dingo_0.02_pr_Chihuahua_0.93 - n02111889_1931_gt_Samoyed_0.07_pr_dogsled_0.79 - n02087394_6357_gt_Rhodesian ridgeback_0.00_pr_dalmatian_0.98 - n02111889_14926_gt_Samoyed_0.03_pr_Arctic_fox_0.95 - - True_positive_two_predictions - n02099601_634_gt_golden retriever_0.30_pr_golden_retriever_0.30 - n02111889_374_gt_Samoyed_0.43_pr_Samoyed_0.43 - n02099601_634_gt_golden retriever_0.30_pr_Labrador_retriever_0.57 - n02111889_374_gt_Samoyed_0.43_pr_crib_0.39 - - -See the list of use case names: - -.. code:: ipython3 - - print(f"Names of use cases: {list(use_cases_image_paths.keys())}") - - -.. parsed-literal:: - - Names of use cases: ['True_positive_high_confidence', 'True_positive_low_confidence', 'False_positive_high_confidence', 'True_positive_two_predictions'] - - -Visualize use case saliency maps -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The function below helps to visualize the results by creating a matrix -of pictures, their names, and the confidence of predictions: - -.. code:: ipython3 - - # Function to show result saliency maps for each use case - def show_use_case_image(use_case): - use_case_output_dir = output / use_case - - image_paths = sorted(os.listdir(use_case_output_dir)) - number_images = len(image_paths) - - fig, axs = plt.subplots((number_images + 1) // 2, 2, figsize=(10, 10)) - fig.tight_layout() - fig.suptitle(use_case) - fig.subplots_adjust(top=0.92) - axs = axs.flatten() - - for image_path, ax in zip(image_paths, axs): - image_sal_map = cv2.imread(f"{use_case_output_dir}/{image_path}") - image_sal_map = cv2.cvtColor(image_sal_map, cv2.COLOR_BGR2RGB) - - image_name = Path(image_path).stem - image_name = image_name.replace("_target", "") - image_name = "_".join(image_name.split("_")[1:]) - - ax.imshow(image_sal_map) - ax.set_title(f"{image_name}", wrap=True) - ax.axis("off") - - if number_images % 2 == 1: - axs[-1].set_visible(False) - - plt.show() - -Naming logic -~~~~~~~~~~~~ - - - -The name of saved saliency maps in this notebook contains info about the -ground-truth class, predicted class, and its confidence. For better -understanding, let’s split the name into meaningful parts and learn its -meaning, taking ``5752_gt_dingo_0.18_pr_Chihuahua_0.93`` as an example. - -``5752`` - the number of the image, truncated from the original name -``n02115641_5752``. - -``gt_dingo_0.18`` - ``ground-truth`` info, the image was annotated as -the ``dingo`` class, and the model predicted this class with ``0.18`` -confidence. - -``pr_Chihuahua_0.93`` - ``predicted`` info, the winning class is -``Chihuahua``, and the model made this prediction with ``0.93`` -confidence. - -Results -------- - - - -True Positive High confidence -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - show_use_case_image("True_positive_high_confidence") - - - -.. image:: explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_50_0.png - - -In the case of ``True positive high confidence``, the model predicts the -correct class and is confident about its prediction. - -The saliency map highlights features that strongly contribute to the -correct class, meaning that those features are very salient for the -current class. We want to roughly estimate that the highlighted features -are correct. From the above images, we see that the dog’s face, nose, -ears, and the general shape of the dog’s body usually contain the -strongest features for the model. That correlates with our common -knowledge and points to the fact that the model is well-trained and -focuses on the needed areas. - -Another sign that the model learns the right features is that the -classes are well distinguished by the model. Cat features are not used -at all to predict ``Samoyed`` in image ``17737``, which is the desired -behavior. - -True Positive Low confidence -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - show_use_case_image("True_positive_low_confidence") - - - -.. image:: explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_53_0.png - - -``True positive low confidence`` basically means that key features are -not well available or are transformed. From the saliency maps, we see -that the model is paying attention to the whole object, trying to make a -decision mostly based on high-level features. - -False Positive High confidence -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - show_use_case_image("False_positive_high_confidence") - - - -.. image:: explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_56_0.png - - -Here we see a few different reasons why the model can predict one class -instead of another: - -- There are objects of two classes represented in the image, and one - class is much more obvious than the other. For example, it’s larger - or in the foreground. We can see this in the image ``2430`` - (``bannister`` instead of ``beagle``), ``1931`` (``dogsled`` instead - of ``samoyed``), ``2323`` (``quilt`` instead of - ``Australian terrier``), ``12304`` (``car mirror`` instead of - ``beagle``). - - We can see that it’s not the problem of the model but rather the - characteristic of the picture itself. In multiclass classification - with only one annotated class in the image (and softmax applied to - the model), this can happen if features of the wrong class dominate - the features of the right class. Also, this might indicate a labeling - error. - -- Two classes look similar in specific shooting settings. - - In the picture ``5752``, the big ``dingo`` dog was confused with a - small ``chihuahua``, focusing only on the face features. In the - picture ``14926``, a sleeping ``samoyed`` was confused with an - ``arctic fox`` because the sleeping position distorted the key - features, making the classes look even more alike than usual. In the - picture ``6357``, shadows created a pattern on the dog, so the model - found key features for the ``dalmatian`` class and predicted it with - high confidence. - -As a result, we see that the model is well-trained and mixes classes -only because of intricate shooting conditions and the presence of more -than one class in the picture. - -Two mixed predictions -~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - show_use_case_image("True_positive_two_predictions") - - - -.. image:: explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_59_0.png - - -Here are examples where two classes are predicted with relatively high -confidence, and the model is sure about both of them. We can see how -saliency maps are different for each class. - -In the picture ``634``, the model can’t decide between -``golden retriever`` and ``labrador``, focusing on the whole face shape. - -In the image ``374``, both ``samoyed`` and ``crib`` are well-seen, so -the model cannot decide between these two classes. We clearly see the -different areas of interest for each of these classes. diff --git a/docs/notebooks/explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_50_0.png b/docs/notebooks/explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_50_0.png deleted file mode 100644 index 1a9f33c3368b17..00000000000000 --- a/docs/notebooks/explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_50_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cbb403d4ab869af3d0d82a3a7980192f2da68c7df2fe27ebd34340465592f46e -size 974504 diff --git a/docs/notebooks/explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_53_0.png b/docs/notebooks/explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_53_0.png deleted file mode 100644 index 62018be36e4c3a..00000000000000 --- a/docs/notebooks/explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_53_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ad8213f746e218068621812a53f4ec4337fb1d0f8fcc763566e5f9e4251cbcb2 -size 917046 diff --git a/docs/notebooks/explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_56_0.png b/docs/notebooks/explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_56_0.png deleted file mode 100644 index 40b6043e87691d..00000000000000 --- a/docs/notebooks/explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_56_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12061dc812cec3219863c5936441baa8ce5b86015acf978ca3d4dcf1212b9c02 -size 681815 diff --git a/docs/notebooks/explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_59_0.png b/docs/notebooks/explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_59_0.png deleted file mode 100644 index a04779fd42ed48..00000000000000 --- a/docs/notebooks/explainable-ai-3-map-interpretation-with-output_files/explainable-ai-3-map-interpretation-with-output_59_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3f48b01dde2edffcea65f08fafe13ff158314f7e75534bbc5d7af027cbc43f5e -size 746506 diff --git a/docs/notebooks/fast-segment-anything-with-output.rst b/docs/notebooks/fast-segment-anything-with-output.rst deleted file mode 100644 index 1806bfd2eaea26..00000000000000 --- a/docs/notebooks/fast-segment-anything-with-output.rst +++ /dev/null @@ -1,907 +0,0 @@ -Object segmentations with FastSAM and OpenVINO 
-============================================== - -`The Fast Segment Anything Model -(FastSAM) `__ is a -real-time CNN-based model that can segment any object within an image -based on various user prompts. ``Segment Anything`` task is designed to -make vision tasks easier by providing an efficient way to identify -objects in an image. FastSAM significantly reduces computational demands -while maintaining competitive performance, making it a practical choice -for a variety of vision tasks. - -FastSAM is a model that aims to overcome the limitations of the `Segment -Anything Model (SAM) `__, -which is a Transformer model that requires significant computational -resources. FastSAM tackles the segment anything task by dividing it into -two consecutive stages: all-instance segmentation and prompt-guided -selection. - -In the first stage, -`YOLOv8-seg `__ is used -to produce segmentation masks for all instances in the image. In the -second stage, FastSAM outputs the region-of-interest corresponding to -the prompt. - -.. figure:: https://user-images.githubusercontent.com/26833433/248551984-d98f0f6d-7535-45d0-b380-2e1440b52ad7.jpg - :alt: pipeline - - pipeline - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ - - - `Install requirements <#install-requirements>`__ - - `Imports <#imports>`__ - -- `FastSAM in Ultralytics <#fastsam-in-ultralytics>`__ -- `Convert the model to OpenVINO Intermediate representation (IR) - format <#convert-the-model-to-openvino-intermediate-representation-ir-format>`__ -- `Embedding the converted models into the original - pipeline <#embedding-the-converted-models-into-the-original-pipeline>`__ - - - `Select inference device <#select-inference-device>`__ - - `Adapt OpenVINO models to the original - pipeline <#adapt-openvino-models-to-the-original-pipeline>`__ - -- `Optimize the model using NNCF Post-training Quantization - API <#optimize-the-model-using-nncf-post-training-quantization-api>`__ - - - `Compare the performance of the Original and Quantized - Models <#compare-the-performance-of-the-original-and-quantized-models>`__ - -- `Try out the converted pipeline <#try-out-the-converted-pipeline>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install requirements -~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %pip install -q "ultralytics==8.3.59" "matplotlib>=3.4" "onnx<1.16.2" tqdm --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "openvino>=2024.4.0" - %pip install -q "nncf>=2.9.0" - %pip install -q "gradio>=4.13" - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - -Imports -~~~~~~~ - - - -.. 
code:: ipython3 - - import ipywidgets as widgets - from pathlib import Path - - import openvino as ov - import torch - from PIL import Image - from ultralytics import FastSAM - - # Fetch skip_kernel_extension module - import requests - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - if not Path("notebook_utils.py").exists(): - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, device_widget - - %load_ext skip_kernel_extension - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("fast-segment-anything.ipynb") - -FastSAM in Ultralytics ----------------------- - - - -To work with `Fast Segment Anything -Model `__ by -``CASIA-IVA-Lab``, we will use the `Ultralytics -package `__. Ultralytics package exposes -the ``FastSAM`` class, simplifying the model instantiation and weights -loading. The code below demonstrates how to initialize a ``FastSAM`` -model and generate a segmentation map. - -.. code:: ipython3 - - model_name = "FastSAM-x" - model = FastSAM(model_name) - - image_uri = Path("coco_bile.jpg") - if not image_uri.exists(): - # Run inference on an image - image_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bike.jpg" - image_uri = download_file(image_url) - results = model(image_uri, device="cpu", retina_masks=True, imgsz=1024, conf=0.6, iou=0.9) - - -.. parsed-literal:: - - Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/FastSAM-x.pt to 'FastSAM-x.pt'... - - -.. parsed-literal:: - - 100%|██████████| 138M/138M [00:07<00:00, 19.6MB/s] - - - -.. parsed-literal:: - - coco_bike.jpg: 0%| | 0.00/182k [00:00=2024.5.0'] not found, attempting AutoUpdate... - requirements: ❌ AutoUpdate skipped (offline) - - OpenVINO: starting export with openvino 2024.4.0-16579-c3152d32c9c-releases/2024/4... - OpenVINO: export success ✅ 7.3s, saved as 'FastSAM-x_openvino_model/' (276.1 MB) - - Export complete (9.4s) - Results saved to /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything - Predict: yolo predict task=segment model=FastSAM-x_openvino_model imgsz=1024 - Validate: yolo val task=segment model=FastSAM-x_openvino_model imgsz=1024 data=ultralytics/datasets/sa.yaml - Visualize: https://netron.app - - -Embedding the converted models into the original pipeline ---------------------------------------------------------- - - - -OpenVINO™ Runtime Python API is used to compile the model in OpenVINO IR -format. The -`Core `__ -class provides access to the OpenVINO Runtime API. The ``core`` object, -which is an instance of the ``Core`` class represents the API and it is -used to compile the model. - -.. code:: ipython3 - - core = ov.Core() - -Select inference device -^^^^^^^^^^^^^^^^^^^^^^^ - - - -Select device that will be used to do models inference using OpenVINO -from the dropdown list: - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. 
parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Adapt OpenVINO models to the original pipeline -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Here we create wrapper classes for the OpenVINO model that we want to -embed in the original inference pipeline. Here are some of the things to -consider when adapting an OV model: - Make sure that parameters passed -by the original pipeline are forwarded to the compiled OV model -properly; sometimes the OV model uses only a portion of the input -arguments and some are ignored, sometimes you need to convert the -argument to another data type or unwrap some data structures such as -tuples or dictionaries. - Guarantee that the wrapper class returns -results to the pipeline in an expected format. In the example below you -can see how we pack OV model outputs into a tuple of ``torch`` tensors. -- Pay attention to the model method used in the original pipeline for -calling the model - it may be not the ``forward`` method! In this -example, the model is a part of a ``predictor`` object and called as and -object, so we need to redefine the magic ``__call__`` method. - -.. code:: ipython3 - - class OVWrapper: - def __init__(self, ov_model, device="CPU", stride=32, ov_config=None) -> None: - ov_config = ov_config or {} - self.model = core.compile_model(ov_model, device, ov_config) - - self.stride = stride - self.pt = False - self.fp16 = False - self.names = {0: "object"} - - def __call__(self, im, **_): - result = self.model(im) - return torch.from_numpy(result[0]), torch.from_numpy(result[1]) - -Now we initialize the wrapper objects and load them to the FastSAM -pipeline. - -.. code:: ipython3 - - ov_config = {} - if "GPU" in device.value or ("AUTO" in device.value and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - - wrapped_model = OVWrapper( - ov_model_path, - device=device.value, - stride=model.predictor.model.stride, - ov_config=ov_config, - ) - model.predictor.model = wrapped_model - - ov_results = model(image_uri, device=device.value, retina_masks=True, imgsz=1024, conf=0.6, iou=0.9) - - -.. parsed-literal:: - - - image 1/1 /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything/coco_bike.jpg: 1024x1024 42 objects, 500.3ms - Speed: 7.8ms preprocess, 500.3ms inference, 39.1ms postprocess per image at shape (1, 3, 1024, 1024) - - -One can observe the converted model outputs in the next cell, they is -the same as of the original model. - -.. code:: ipython3 - - Image.fromarray(ov_results[0].plot()[..., ::-1]) - - - - -.. image:: fast-segment-anything-with-output_files/fast-segment-anything-with-output_21_0.png - - - -Optimize the model using NNCF Post-training Quantization API ------------------------------------------------------------- - - - -`NNCF `__ provides a suite of -advanced algorithms for Neural Networks inference optimization in -OpenVINO with minimal accuracy drop. We will use 8-bit quantization in -post-training mode (without the fine-tuning pipeline) to optimize -FastSAM. - -The optimization process contains the following steps: - -1. Create a Dataset for quantization. -2. Run ``nncf.quantize`` to obtain a quantized model. -3. Save the INT8 model using ``openvino.save_model()`` function. - -.. code:: ipython3 - - do_quantize = widgets.Checkbox( - value=True, - description="Quantization", - disabled=False, - ) - - do_quantize - - - - -.. 
parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -The ``nncf.quantize`` function provides an interface for model -quantization. It requires an instance of the OpenVINO Model and -quantization dataset. Optionally, some additional parameters for the -configuration quantization process (number of samples for quantization, -preset, ignored scope, etc.) can be provided. YOLOv8 model backing -FastSAM contains non-ReLU activation functions, which require asymmetric -quantization of activations. To achieve a better result, we will use a -``mixed`` quantization preset. It provides symmetric quantization of -weights and asymmetric quantization of activations. For more accurate -results, we should keep the operation in the postprocessing subgraph in -floating point precision, using the ``ignored_scope`` parameter. - -The quantization algorithm is based on `The YOLOv8 quantization -example `__ -in the NNCF repo, refer there for more details. Moreover, you can check -out other quantization tutorials in the `OV notebooks -repo <-with-output.html>`__. - - **Note**: Model post-training quantization is time-consuming process. - Be patient, it can take several minutes depending on your hardware. - -.. code:: ipython3 - - %%skip not $do_quantize.value - - import pickle - from contextlib import contextmanager - from zipfile import ZipFile - - import cv2 - from tqdm.autonotebook import tqdm - - import nncf - - - COLLECT_CALIBRATION_DATA = False - calibration_data = [] - - @contextmanager - def calibration_data_collection(): - global COLLECT_CALIBRATION_DATA - try: - COLLECT_CALIBRATION_DATA = True - yield - finally: - COLLECT_CALIBRATION_DATA = False - - - class NNCFWrapper: - def __init__(self, ov_model, stride=32) -> None: - self.model = core.read_model(ov_model) - self.compiled_model = core.compile_model(self.model, device_name="CPU") - - self.stride = stride - self.pt = False - self.fp16 = False - self.names = {0: "object"} - - def __call__(self, im, **_): - if COLLECT_CALIBRATION_DATA: - calibration_data.append(im) - - result = self.compiled_model(im) - return torch.from_numpy(result[0]), torch.from_numpy(result[1]) - - # Fetch data from the web and descibe a dataloader - DATA_URL = "https://ultralytics.com/assets/coco128.zip" - OUT_DIR = Path('.') - - download_file(DATA_URL, directory=OUT_DIR, show_progress=True) - - if not (OUT_DIR / "coco128/images/train2017").exists(): - with ZipFile('coco128.zip', "r") as zip_ref: - zip_ref.extractall(OUT_DIR) - - class COCOLoader(torch.utils.data.Dataset): - def __init__(self, images_path): - self.images = list(Path(images_path).iterdir()) - - def __getitem__(self, index): - if isinstance(index, slice): - return [self.read_image(image_path) for image_path in self.images[index]] - return self.read_image(self.images[index]) - - def read_image(self, image_path): - image = cv2.imread(str(image_path)) - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - return image - - def __len__(self): - return len(self.images) - - - def collect_calibration_data_for_decoder(model, calibration_dataset_size: int, - calibration_cache_path: Path): - global calibration_data - - - if not calibration_cache_path.exists(): - coco_dataset = COCOLoader(OUT_DIR / 'coco128/images/train2017') - with calibration_data_collection(): - for image in tqdm(coco_dataset[:calibration_dataset_size], desc="Collecting calibration data"): - model(image, retina_masks=True, imgsz=1024, conf=0.6, iou=0.9, verbose=False) - calibration_cache_path.parent.mkdir(parents=True, exist_ok=True) - 
with open(calibration_cache_path, "wb") as f: - pickle.dump(calibration_data, f) - else: - with open(calibration_cache_path, "rb") as f: - calibration_data = pickle.load(f) - - return calibration_data - - - def quantize(model, save_model_path: Path, calibration_cache_path: Path, - calibration_dataset_size: int, preset: nncf.QuantizationPreset): - calibration_data = collect_calibration_data_for_decoder( - model, calibration_dataset_size, calibration_cache_path) - quantized_ov_decoder = nncf.quantize( - model.predictor.model.model, - calibration_dataset=nncf.Dataset(calibration_data), - preset=preset, - subset_size=len(calibration_data), - fast_bias_correction=True, - ignored_scope=nncf.IgnoredScope( - types=["Multiply", "Subtract", "Sigmoid"], # ignore operations - names=[ - "__module.model.22.dfl.conv/aten::_convolution/Convolution", # in the post-processing subgraph - "__module.model.22/aten::add/Add", - "__module.model.22/aten::add/Add_1" - ], - ) - ) - ov.save_model(quantized_ov_decoder, save_model_path) - - wrapped_model = NNCFWrapper(ov_model_path, stride=model.predictor.model.stride) - model.predictor.model = wrapped_model - - calibration_dataset_size = 128 - quantized_model_path = Path(f"{model_name}_quantized") / "FastSAM-x.xml" - calibration_cache_path = Path(f"calibration_data/coco{calibration_dataset_size}.pkl") - if not quantized_model_path.exists(): - quantize(model, quantized_model_path, calibration_cache_path, - calibration_dataset_size=calibration_dataset_size, - preset=nncf.QuantizationPreset.MIXED) - - -.. parsed-literal:: - - :7: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - - -.. parsed-literal:: - - coco128.zip: 0%| | 0.00/6.66M [00:00`__. - -The app allows you to alter the model output interactively. Using the -Pixel selector type switch you can place foreground/background points or -bounding boxes on input image. - -.. 
code:: ipython3 - - import cv2 - import numpy as np - import matplotlib.pyplot as plt - - - def fast_process( - annotations, - image, - scale, - better_quality=False, - mask_random_color=True, - bbox=None, - use_retina=True, - with_contours=True, - ): - original_h = image.height - original_w = image.width - - if better_quality: - for i, mask in enumerate(annotations): - mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8)) - annotations[i] = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8)) - - inner_mask = fast_show_mask( - annotations, - plt.gca(), - random_color=mask_random_color, - bbox=bbox, - retinamask=use_retina, - target_height=original_h, - target_width=original_w, - ) - - if with_contours: - contour_all = [] - temp = np.zeros((original_h, original_w, 1)) - for i, mask in enumerate(annotations): - annotation = mask.astype(np.uint8) - if not use_retina: - annotation = cv2.resize( - annotation, - (original_w, original_h), - interpolation=cv2.INTER_NEAREST, - ) - contours, _ = cv2.findContours(annotation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - for contour in contours: - contour_all.append(contour) - cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2 // scale) - color = np.array([0 / 255, 0 / 255, 255 / 255, 0.9]) - contour_mask = temp / 255 * color.reshape(1, 1, -1) - - image = image.convert("RGBA") - overlay_inner = Image.fromarray((inner_mask * 255).astype(np.uint8), "RGBA") - image.paste(overlay_inner, (0, 0), overlay_inner) - - if with_contours: - overlay_contour = Image.fromarray((contour_mask * 255).astype(np.uint8), "RGBA") - image.paste(overlay_contour, (0, 0), overlay_contour) - - return image - - - # CPU post process - def fast_show_mask( - annotation, - ax, - random_color=False, - bbox=None, - retinamask=True, - target_height=960, - target_width=960, - ): - mask_sum = annotation.shape[0] - height = annotation.shape[1] - weight = annotation.shape[2] - # - areas = np.sum(annotation, axis=(1, 2)) - sorted_indices = np.argsort(areas)[::1] - annotation = annotation[sorted_indices] - - index = (annotation != 0).argmax(axis=0) - if random_color: - color = np.random.random((mask_sum, 1, 1, 3)) - else: - color = np.ones((mask_sum, 1, 1, 3)) * np.array([30 / 255, 144 / 255, 255 / 255]) - transparency = np.ones((mask_sum, 1, 1, 1)) * 0.6 - visual = np.concatenate([color, transparency], axis=-1) - mask_image = np.expand_dims(annotation, -1) * visual - - mask = np.zeros((height, weight, 4)) - - h_indices, w_indices = np.meshgrid(np.arange(height), np.arange(weight), indexing="ij") - indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None)) - - mask[h_indices, w_indices, :] = mask_image[indices] - if bbox is not None: - x1, y1, x2, y2 = bbox - ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor="b", linewidth=1)) - - if not retinamask: - mask = cv2.resize(mask, (target_width, target_height), interpolation=cv2.INTER_NEAREST) - - return mask - -This is the main callback function that is called to segment an image -based on user input. - -.. 
code:: ipython3 - - object_points = [] - background_points = [] - bbox_points = [] - - - def segment( - image, - model_type, - input_size=1024, - iou_threshold=0.75, - conf_threshold=0.4, - better_quality=True, - with_contours=True, - use_retina=True, - mask_random_color=True, - ): - if do_quantize.value and model_type == "Quantized model": - model.predictor.model = quantized_wrapped_model - else: - model.predictor.model = wrapped_model - - input_size = int(input_size) - w, h = image.size - scale = input_size / max(w, h) - new_w = int(w * scale) - new_h = int(h * scale) - image = image.resize((new_w, new_h)) - - results = model( - image, - retina_masks=use_retina, - iou=iou_threshold, - conf=conf_threshold, - imgsz=input_size, - ) - - masks = results[0].masks.data - # Calculate annotations - if not (object_points or bbox_points): - annotations = masks.cpu().numpy() - else: - annotations = [] - - if object_points: - all_points = object_points + background_points - labels = [1] * len(object_points) + [0] * len(background_points) - scaled_points = [[int(x * scale) for x in point] for point in all_points] - h, w = masks[0].shape[:2] - assert max(h, w) == input_size - onemask = np.zeros((h, w)) - for mask in sorted(masks, key=lambda x: x.sum(), reverse=True): - mask_np = (mask == 1.0).cpu().numpy() - for point, label in zip(scaled_points, labels): - if mask_np[point[1], point[0]] == 1 and label == 1: - onemask[mask_np] = 1 - if mask_np[point[1], point[0]] == 1 and label == 0: - onemask[mask_np] = 0 - annotations.append(onemask >= 1) - if len(bbox_points) >= 2: - scaled_bbox_points = [] - for i, point in enumerate(bbox_points): - x, y = int(point[0] * scale), int(point[1] * scale) - x = max(min(x, new_w), 0) - y = max(min(y, new_h), 0) - scaled_bbox_points.append((x, y)) - - for i in range(0, len(scaled_bbox_points) - 1, 2): - x0, y0, x1, y1 = *scaled_bbox_points[i], *scaled_bbox_points[i + 1] - - intersection_area = torch.sum(masks[:, y0:y1, x0:x1], dim=(1, 2)) - masks_area = torch.sum(masks, dim=(1, 2)) - bbox_area = (y1 - y0) * (x1 - x0) - - union = bbox_area + masks_area - intersection_area - iou = intersection_area / union - max_iou_index = torch.argmax(iou) - - annotations.append(masks[max_iou_index].cpu().numpy()) - - return fast_process( - annotations=np.array(annotations), - image=image, - scale=(1024 // input_size), - better_quality=better_quality, - mask_random_color=mask_random_color, - bbox=None, - use_retina=use_retina, - with_contours=with_contours, - ) - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/fast-segment-anything/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=segment, quantized=do_quantize.value) - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(share=True, debug=False) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name="your server name", server_port="server port in int")` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ - - -.. parsed-literal:: - - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. 
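The ``segment`` callback defined above can also be exercised directly, without
the Gradio UI, as a quick sanity check. The sketch below is illustrative only:
the point coordinates are arbitrary placeholders, and any ``model_type`` value
other than ``"Quantized model"`` falls back to the original (non-quantized)
pipeline.

.. code:: ipython3

    from PIL import Image

    # Reuse the image downloaded earlier in this notebook and mark one
    # hypothetical foreground point (pixel coordinates are placeholders).
    test_image = Image.open(image_uri)
    object_points.append([420, 400])

    result = segment(test_image, model_type="FP32 model")
    object_points.clear()

    result.save("segment_callback_check.png")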
- - - - - - - diff --git a/docs/notebooks/fast-segment-anything-with-output_files/fast-segment-anything-with-output_21_0.jpg b/docs/notebooks/fast-segment-anything-with-output_files/fast-segment-anything-with-output_21_0.jpg deleted file mode 100644 index d5eefa7f4ba745..00000000000000 --- a/docs/notebooks/fast-segment-anything-with-output_files/fast-segment-anything-with-output_21_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:28474c1a5b3bfc63d893b2a4d2878d7368e439d085eb34b7dabafa4d1664743c -size 128684 diff --git a/docs/notebooks/fast-segment-anything-with-output_files/fast-segment-anything-with-output_21_0.png b/docs/notebooks/fast-segment-anything-with-output_files/fast-segment-anything-with-output_21_0.png deleted file mode 100644 index 2b1e89c6367a3e..00000000000000 --- a/docs/notebooks/fast-segment-anything-with-output_files/fast-segment-anything-with-output_21_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2bbff9cdd0b3cf370807c947b57f0a8e115cafb63b147143be3a9ca4aa2830a0 -size 804161 diff --git a/docs/notebooks/fast-segment-anything-with-output_files/fast-segment-anything-with-output_9_0.jpg b/docs/notebooks/fast-segment-anything-with-output_files/fast-segment-anything-with-output_9_0.jpg deleted file mode 100644 index 08b675f56c103e..00000000000000 --- a/docs/notebooks/fast-segment-anything-with-output_files/fast-segment-anything-with-output_9_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a4cd201b4359b4fda32f79ca462c0bcb044864067c76d84be01a879653ec45db -size 125898 diff --git a/docs/notebooks/fast-segment-anything-with-output_files/fast-segment-anything-with-output_9_0.png b/docs/notebooks/fast-segment-anything-with-output_files/fast-segment-anything-with-output_9_0.png deleted file mode 100644 index 1e74ddfa4f2a2d..00000000000000 --- a/docs/notebooks/fast-segment-anything-with-output_files/fast-segment-anything-with-output_9_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:27434cfd4bbee84cbc6908653a63a864396fcad00b1a045a381364856f939b00 -size 819795 diff --git a/docs/notebooks/film-slowmo-with-output.rst b/docs/notebooks/film-slowmo-with-output.rst deleted file mode 100644 index 9335ea66fae683..00000000000000 --- a/docs/notebooks/film-slowmo-with-output.rst +++ /dev/null @@ -1,523 +0,0 @@ -Frame interpolation using FILM and OpenVINO -=========================================== - -`Frame -interpolation `__ is -the process of synthesizing in-between images from a given set of -images. The technique is often used for `temporal -up-sampling `__ -to increase the refresh rate of videos or to create slow motion effects. -Nowadays, with digital cameras and smartphones, we often take several -photos within a few seconds to capture the best picture. Interpolating -between these “near-duplicate” photos can lead to engaging videos that -reveal scene motion, often delivering an even more pleasing sense of the -moment than the original photos. - -In `“FILM: Frame Interpolation for Large -Motion” `__, published at ECCV -2022, a method to create high quality slow-motion videos from -near-duplicate photos is presented. FILM is a new neural network -architecture that achieves state-of-the-art results in large motion, -while also handling smaller motions well. - -The FILM model takes two images as input and outputs a middle image. At -inference time, the model is recursively invoked to output in-between -images. 
FILM has three components: 1. Feature extractor that summarizes -each input image with deep multi-scale (pyramid) features; 2. -Bi-directional motion estimator that computes pixel-wise motion (i.e., -flows) at each pyramid level; 3. Fusion module that outputs the final -interpolated image. - -FILM is trained on regular video frame triplets, with the middle frame -serving as the ground-truth for supervision. - -In this tutorial, we will use `TensorFlow Hub `__ as -a model source. - - **NOTE**: To run this tutorial, your system is required to have a VP9 - video encoder. Ubuntu has it preinstalled, but for Windows, you - should install it manually. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Prepare images <#prepare-images>`__ -- `Load the model <#load-the-model>`__ -- `Infer the model <#infer-the-model>`__ - - - `Single middle frame - interpolation <#single-middle-frame-interpolation>`__ - - `Recursive frame generation <#recursive-frame-generation>`__ - -- `Convert the model to OpenVINO - IR <#convert-the-model-to-openvino-ir>`__ -- `Inference <#inference>`__ - - - `Select inference device <#select-inference-device>`__ - - `Single middle frame - interpolation <#single-middle-frame-interpolation>`__ - - `Recursive frame generation <#recursive-frame-generation>`__ - -- `Interactive inference <#interactive-inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - import os - - %pip install -q "tensorflow-macos>=2.5; sys_platform == 'darwin' and platform_machine == 'arm64' and python_version > '3.8'" # macOS M1 and M2 - %pip install -q "tensorflow-macos>=2.5,<=2.12.0; sys_platform == 'darwin' and platform_machine == 'arm64' and python_version <= '3.8'" # macOS M1 and M2 - %pip install -q "tensorflow>=2.5; sys_platform == 'darwin' and platform_machine != 'arm64' and python_version > '3.8'" # macOS x86 - %pip install -q "tensorflow>=2.5,<=2.12.0; sys_platform == 'darwin' and platform_machine != 'arm64' and python_version <= '3.8'" # macOS x86 - %pip install -q "tensorflow>=2.5; sys_platform != 'darwin' and python_version > '3.8'" - %pip install -q "tensorflow>=2.5,<=2.12.0; sys_platform != 'darwin' and python_version <= '3.8'" - - %pip install -q --no-deps "tensorflow-hub" - %pip install -q tf_keras numpy "opencv-python" tqdm "gradio>=4.19" Pillow "openvino>=2023.2.0" - - %pip install -q "matplotlib>=3.4" - -.. 
code:: ipython3 - - from pathlib import Path - import requests - from typing import Optional, Generator - from datetime import datetime - import gc - - os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" - os.environ["TF_USE_LEGACY_KERAS"] = "1" - os.environ["TFHUB_CACHE_DIR"] = str(Path("./tfhub_modules").resolve()) - - import tensorflow_hub as hub - import tensorflow as tf - import openvino as ov - import numpy as np - import cv2 - import matplotlib.pyplot as plt - from tqdm.auto import tqdm - import gradio as gr - import PIL - import IPython - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("film-slowmo.ipynb") - -.. code:: ipython3 - - MODEL_PATH = Path("models/model.xml") - DATA_PATH = Path("data") - IMAGES = { - "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/c3ddf65f-95ec-44ca-9ed4-3ef2d8f4b47e": Path("data/one.jpg"), - "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/6d21f1ce-69eb-41b5-aedd-6e3c29013b30": Path("data/two.jpg"), - } - OUTPUT_VIDEO_PATH = DATA_PATH / "output.webm" - OV_OUTPUT_VIDEO_PATH = DATA_PATH / "ov_output.webm" - TIMES_TO_INTERPOLATE = 5 - DATA_PATH.mkdir(parents=True, exist_ok=True) - - PIL.ImageFile.LOAD_TRUNCATED_IMAGES = True # allows Gradio to read PNG images with large metadata - -Prepare images --------------- - - - -Download images and cast them to NumPy arrays to provide as model -inputs. - -.. code:: ipython3 - - def preprocess_np_frame(frame): - result = frame.astype(np.float32) / 255 # normalize to [0, 1] - result = result[np.newaxis, ...] # add batch dim - return result - - - def prepare_input(img_url: str): - if not IMAGES[img_url].exists(): - r = requests.get(img_url) - with IMAGES[img_url].open("wb") as f: - f.write(r.content) - filename = str(IMAGES[img_url]) - img = cv2.imread(filename) - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - img = np.array(img) - img = preprocess_np_frame(img) - - return img - - - input_images = [prepare_input(url) for url in IMAGES] - - input = { - "x0": input_images[0], - "x1": input_images[1], - "time": np.array([[0.5]], dtype=np.float32), - } - -.. code:: ipython3 - - plt.figure(figsize=(16, 8), layout="tight") - plt.subplot(1, 2, 1) - plt.imshow(input_images[0][0]) - plt.axis("off") - plt.subplot(1, 2, 2) - plt.imshow(input_images[1][0]) - plt.axis("off"); - - - -.. image:: film-slowmo-with-output_files/film-slowmo-with-output_7_0.png - - -Load the model --------------- - - - -Model is loaded using ``tensorflow_hub.KerasLayer`` function. Then, we -specify shapes of input tensors to cast loaded object to -``tf.keras.Model`` class. - -Input tensors are: - ``time`` - value between :math:`[0,1]` that says -where the generated image should be. :math:`0.5` is midway between the -input images. - ``x0`` - initial frame. - ``x1`` - final frame. - -For more details, see `model page on TensorFlow -Hub `__. - -.. 
code:: ipython3 - - inputs = dict( - x0=tf.keras.layers.Input(shape=(None, None, 3)), - x1=tf.keras.layers.Input(shape=(None, None, 3)), - time=tf.keras.layers.Input(shape=(1)), - ) - model_url = "https://www.kaggle.com/models/google/film/frameworks/tensorFlow2/variations/film/versions/1" - film_layer = hub.KerasLayer(model_url)(inputs) - film_model = tf.keras.Model(inputs=inputs, outputs=film_layer) - -Infer the model ---------------- - - - -Single middle frame interpolation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - output = film_model(input) - interpolated_image = output["image"][0] - interpolated_image = np.clip(interpolated_image, 0, 1) - -.. code:: ipython3 - - def draw(img1, mid_img, img2): - title2img = {"First frame": img1, "Interpolated frame": mid_img, "Last frame": img2} - plt.figure(figsize=(16, 8), layout="tight") - for i, (title, img) in enumerate(title2img.items()): - ax = plt.subplot(1, 3, i + 1) - ax.set_title(title) - plt.imshow(img) - plt.axis("off") - -.. code:: ipython3 - - draw(input_images[0][0], interpolated_image, input_images[1][0]) - - - -.. image:: film-slowmo-with-output_files/film-slowmo-with-output_14_0.png - - -Recursive frame generation -~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The process will take as input 2 original frames (first and last) and -generate a midpoint frame. Then, it will repeat itself for pairs “first -- midpoint”, “midpoint - last” to provide midpoints for them, and so on. -Recursion is executed :math:`t=` ``times_to_interpolate`` times -generating :math:`2^t-1` images. - -.. code:: ipython3 - - class Interpolator: - def __init__(self, model): - self._model = model - - def _recursive_generator( - self, - frame1: np.ndarray, - frame2: np.ndarray, - num_recursions: int, - bar: Optional[tqdm] = None, - ) -> Generator[np.ndarray, None, None]: - """Splits halfway to repeatedly generate more frames. - - Args: - frame1: Input image 1. - frame2: Input image 2. - num_recursions: How many times to interpolate the consecutive image pairs. - - Yields: - The interpolated frames, including the first frame (frame1), but excluding - the final frame2. - """ - if num_recursions == 0: - yield frame1 - else: - time = np.array([[0.5]], dtype=np.float32) - mid_frame = self._model({"x0": frame1, "x1": frame2, "time": time})["image"] - if bar is not None: - bar.update(1) - yield from self._recursive_generator(frame1, mid_frame, num_recursions - 1, bar) - yield from self._recursive_generator(mid_frame, frame2, num_recursions - 1, bar) - - def interpolate_recursively(self, frame1: np.ndarray, frame2: np.ndarray, times_to_interpolate: int) -> Generator[np.ndarray, None, None]: - """Generates interpolated frames by repeatedly interpolating the midpoint. - - Args: - frame1: Input image 1. - frame2: Input image 2. - times_to_interpolate: Number of times to do recursive midpoint - interpolation. - - Yields: - The interpolated frames (including the inputs). - """ - num_frames = 2 ** (times_to_interpolate) - 1 - bar = tqdm(total=num_frames) - yield from self._recursive_generator(frame1, frame2, times_to_interpolate, bar) - # Separately yield the final frame. - yield frame2 - -.. 
code:: ipython3 - - def save_as_video(frames: Generator[np.ndarray, None, None], width: int, height: int, filename: Path): - out = cv2.VideoWriter(str(filename), cv2.VideoWriter_fourcc(*"VP90"), 30, (width, height)) - for frame in frames: - img = frame[0] - img = np.clip(img, 0, 1) - rgb_img = img * 255 - rgb_img = rgb_img.astype(np.uint8) - bgr_img = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2BGR) - out.write(bgr_img) - out.release() - -.. code:: ipython3 - - height, width = input_images[0][0].shape[:2] - interpolator = Interpolator(film_model) - frames = interpolator.interpolate_recursively(input_images[0], input_images[1], 5) - save_as_video(frames, width, height, OUTPUT_VIDEO_PATH) - - -.. parsed-literal:: - - OpenCV: FFMPEG: tag 0x30395056/'VP90' is not supported with codec id 167 and format 'webm / WebM' - - - -.. parsed-literal:: - - 0%| | 0/31 [00:00 - - Your browser does not support the video tag. - - - - -Convert the model to OpenVINO IR --------------------------------- - - - -To convert a TensorFlow Keras Model to OpenVINO Intermediate -Representation (IR), call the ``openvino.convert_model()`` function and -pass the model as the only argument. You can then serialize the model -object to disk using the ``openvino.save_model()`` function. - -.. code:: ipython3 - - if not MODEL_PATH.exists(): - converted_model = ov.convert_model(film_model) - ov.save_model(converted_model, MODEL_PATH) - del converted_model - del film_model - gc.collect(); - -Inference ---------- - - - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - core = ov.Core() - device = device_widget() - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - compiled_model = core.compile_model(MODEL_PATH, device.value) - -Single middle frame interpolation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Model output has multiple tensors, including auxiliary inference data. -The main output tensor - interpolated image - is stored at “image” key. - -.. code:: ipython3 - - result = compiled_model(input)["image"] - image = result[0] - image = np.clip(image, 0, 1) - -Model returned intermediate image. Let’s see what it is. - -.. code:: ipython3 - - draw(input_images[0][0], image, input_images[1][0]) - - - -.. image:: film-slowmo-with-output_files/film-slowmo-with-output_29_0.png - - -Recursive frame generation -~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Now let’s create a smooth video by recursively generating frames between -initial, middle and final images. - -.. code:: ipython3 - - height, width = input_images[0][0].shape[:2] - ov_interpolator = Interpolator(compiled_model) - frames = ov_interpolator.interpolate_recursively(input_images[0], input_images[1], TIMES_TO_INTERPOLATE) - save_as_video(frames, width, height, OV_OUTPUT_VIDEO_PATH) - - -.. parsed-literal:: - - OpenCV: FFMPEG: tag 0x30395056/'VP90' is not supported with codec id 167 and format 'webm / WebM' - - - -.. parsed-literal:: - - 0%| | 0/31 [00:00 - - Your browser does not support the video tag. - - - - -Interactive inference ---------------------- - - - -.. 
code:: ipython3 - - def generate(frame1, frame2, times_to_interpolate, _=gr.Progress(track_tqdm=True)): - x0, x1 = [preprocess_np_frame(frame) for frame in [frame1, frame2]] - frames = ov_interpolator.interpolate_recursively(x0, x1, times_to_interpolate) - height, width = frame1.shape[:2] - filename = DATA_PATH / f"output_{datetime.now().isoformat()}.webm" - save_as_video(frames, width, height, filename) - return filename - - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/film-slowmo/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=generate) - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/film-slowmo-with-output_files/film-slowmo-with-output_14_0.png b/docs/notebooks/film-slowmo-with-output_files/film-slowmo-with-output_14_0.png deleted file mode 100644 index 4ccfd95299d2f4..00000000000000 --- a/docs/notebooks/film-slowmo-with-output_files/film-slowmo-with-output_14_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4cfb9fec7a17248d0934e5f9b83b9b86eaa175535901064744bfdf5dfba2626d -size 466103 diff --git a/docs/notebooks/film-slowmo-with-output_files/film-slowmo-with-output_29_0.png b/docs/notebooks/film-slowmo-with-output_files/film-slowmo-with-output_29_0.png deleted file mode 100644 index 234a5ae52cf0f7..00000000000000 --- a/docs/notebooks/film-slowmo-with-output_files/film-slowmo-with-output_29_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f8ccc0d255c0d77a422d4a84d5864b5cff4a415b97a78555f254e74dae9bf54e -size 465989 diff --git a/docs/notebooks/film-slowmo-with-output_files/film-slowmo-with-output_7_0.png b/docs/notebooks/film-slowmo-with-output_files/film-slowmo-with-output_7_0.png deleted file mode 100644 index 5930e6fd40b355..00000000000000 --- a/docs/notebooks/film-slowmo-with-output_files/film-slowmo-with-output_7_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ab82d808cf872d2714801a251bb8b3173c12ca1cec12474a07e04aeecaa7f177 -size 484833 diff --git a/docs/notebooks/florence2-with-output.rst b/docs/notebooks/florence2-with-output.rst deleted file mode 100644 index 6ca7e0e6ad43e7..00000000000000 --- a/docs/notebooks/florence2-with-output.rst +++ /dev/null @@ -1,645 +0,0 @@ -Florence-2: Open Source Vision Foundation Model -=============================================== - -Florence-2 is a lightweight vision-language foundation model developed -by Microsoft Azure AI and open-sourced under the MIT license. It aims to -achieve a unified, prompt-based representation for diverse vision and -vision-language tasks, including captioning, object detection, -grounding, and segmentation. Despite its compact size, Florence-2 rivals -much larger models like -`Kosmos-2 `__ -in performance. Florence-2 represents a significant advancement in -vision-language models by combining lightweight architecture with robust -capabilities, making it highly accessible and versatile. 
Its unified -representation approach, supported by the extensive FLD-5B dataset, -enables it to excel in multiple vision tasks without the need for -separate models. This efficiency makes Florence-2 a strong contender for -real-world applications, particularly on devices with limited resources. - -More details about model can be found in `model’s resources -collection `__ -and original `paper `__ - -In this tutorial we consider how to convert and run Florence2 using -OpenVINO. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Select model <#select-model>`__ -- `Convert model <#convert-model>`__ -- `Select inference device <#select-inference-device>`__ -- `Run model inference <#run-model-inference>`__ -- `Interactive Demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "einops" "torch>2.1" "torchvision" "matplotlib>=3.4" "timm>=0.9.8" "transformers>=4.41" "pillow" "gradio>=4.19" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q -U --pre "openvino>=2024.6" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - ERROR: Could not find a version that satisfies the requirement openvino>=2024.6 (from versions: 2021.3.0, 2021.4.0, 2021.4.1, 2021.4.2, 2022.1.0, 2022.2.0, 2022.3.0, 2022.3.1, 2022.3.2, 2023.0.0.dev20230119, 2023.0.0.dev20230217, 2023.0.0.dev20230407, 2023.0.0.dev20230427, 2023.0.0, 2023.0.1, 2023.0.2, 2023.1.0.dev20230623, 2023.1.0.dev20230728, 2023.1.0.dev20230811, 2023.1.0, 2023.2.0.dev20230922, 2023.2.0, 2023.3.0, 2024.0.0.dev20240215, 2024.0.0rc2, 2024.0.0, 2024.1.0rc2, 2024.1.0, 2024.2.0rc1, 2024.2.0rc2, 2024.2.0, 2024.3.0.dev20240807, 2024.3.0rc1, 2024.3.0rc2, 2024.3.0, 2024.4.0rc1, 2024.4.0rc2, 2024.4.0, 2024.4.1.dev20240926, 2024.4.1rc1) - ERROR: No matching distribution found for openvino>=2024.6 - Note: you may need to restart the kernel to use updated packages. - - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("ov_florence2_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/florence2/ov_florence2_helper.py") - open("ov_florence2_helper.py", "w").write(r.text) - - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/florence2/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("florence2.ipynb") - -Select model ------------- - - - -The Florence-2 series consists of two models: Florence-2-base and -Florence-2-large, with 0.23 billion and 0.77 billion parameters, -respectively. 
 Additionally, the authors provide model versions fine-tuned on a collection of -downstream tasks. In this tutorial you can select one of the -available models. By default, we will use -`Florence-2-base-ft `__. - -.. code:: ipython3 - - from ov_florence2_helper import convert_florence2, get_model_selector - - model_selector = get_model_selector() - - model_selector - - -.. parsed-literal:: - - 2025-02-04 02:29:03.827716: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2025-02-04 02:29:03.861899: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2025-02-04 02:29:04.528243: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - - - -.. parsed-literal:: - - Dropdown(description='Model:', options=('microsoft/Florence-2-base-ft', 'microsoft/Florence-2-base', 'microsof… - - - -Convert model ------------- - - - -Florence2 is a PyTorch model. OpenVINO supports PyTorch models via -conversion to OpenVINO Intermediate Representation (IR). The `OpenVINO model -conversion -API `__ -should be used for these purposes. The ``ov.convert_model`` function accepts an -original PyTorch model instance and example input for tracing and -returns an ``ov.Model`` representing this model in the OpenVINO framework. -The converted model can be saved on disk using the ``ov.save_model`` -function or loaded directly on a device using ``core.compile_model``. -The ``ov_florence2_helper.py`` script contains a helper function for model -conversion; please check its content if you are interested in the conversion -details. - -.. raw:: html - -
      
 - -Click here for more detailed explanation of conversion steps The model -takes images and task prompts as input, generating the desired results -in text format. It uses a DaViT vision encoder to convert images into -visual token embeddings. These are then concatenated with BERT-generated -text embeddings and processed by a transformer-based multi-modal -encoder-decoder to generate the response. - -|image0| - -To sum up, the model consists of four parts: - -- **Image Encoder** for transforming input images into flattened visual - token embeddings. -- **Input Embedding** for converting input text tokens or the task - description into the embedding space. -- **Encoder** and **Decoder** for generating the answer based on the input - embeddings provided by the Image Encoder and Input Embedding models. The - model employs a seq2seq framework, seamlessly integrating the image - encoder with a multi-modality encoder-decoder. - -We will convert each part separately, then combine them in the inference -pipeline; a simplified conversion sketch is shown right after this section. - -.. raw:: html - -
      
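 The actual conversion code lives in ``convert_florence2`` from ``ov_florence2_helper.py``. As a rough illustration only (not the helper's real implementation), converting any single part of the model follows the same pattern: trace the PyTorch submodule with ``ov.convert_model`` and serialize it with ``ov.save_model``. The submodule attribute and the example input shape below are hypothetical placeholders. .. code:: python import torch import openvino as ov def convert_part(part: torch.nn.Module, example_input, xml_path: str): """Trace one PyTorch submodule and save it as OpenVINO IR.""" part.eval() ov_part = ov.convert_model(part, example_input=example_input) # returns ov.Model ov.save_model(ov_part, xml_path) # writes the .xml/.bin IR pair to disk # Hypothetical usage for the vision part only (attribute name is a placeholder): # convert_part(pt_model.vision_tower, torch.zeros(1, 3, 768, 768), "image_encoder.xml") 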
- -.. |image0| image:: https://blog.roboflow.com/content/images/2024/06/Screenshot-2024-06-19-at-22.34.35-1-Medium.jpeg - -.. code:: ipython3 - - model_id = model_selector.value - model_path = Path(model_id.split("/")[-1]) - - # Uncomment the line to see conversion code - # ??convert_florence2 - -.. code:: ipython3 - - convert_florence2(model_id, model_path) - - -.. parsed-literal:: - - ⌛ microsoft/Florence-2-base-ft conversion started. Be patient, it may takes some time. - ⌛ Load Original model - - - -.. parsed-literal:: - - Fetching 15 files: 0%| | 0/15 [00:00 0 or pad_b > 0: - /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/chkpt/modeling_florence2.py:349: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - q = q * (float(N) ** -0.5) - /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/chkpt/modeling_florence2.py:2610: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - h, w = int(num_tokens ** 0.5), int(num_tokens ** 0.5) - /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/chkpt/modeling_florence2.py:2611: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert h * w == num_tokens, 'only support square feature maps for now' - /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/chkpt/modeling_florence2.py:151: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert len_seq <= self.max_seq_len - - -.. parsed-literal:: - - ✅ Image Embeddings successfuly converted - ⌛ Text Embedding conversion started - ✅ Text Embedding conversion started - ⌛ Encoder conversion started - - -.. parsed-literal:: - - /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/chkpt/modeling_florence2.py:1218: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): - - -.. parsed-literal:: - - ✅ Encoder conversion finished - ⌛ Decoder conversion started - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:88: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
- if input_shape[-1] > 1 or self.sliding_window is not None: - /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/chkpt/modeling_florence2.py:1205: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False - /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/chkpt/modeling_florence2.py:1167: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if ( - - -.. parsed-literal:: - - ✅ Decoder conversion finished - ✅ microsoft/Florence-2-base-ft already converted and can be found in Florence-2-base-ft - - -Select inference device ------------------------ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Run model inference -------------------- - - - -``OvFlorence2Model`` class defined in ``ov_florence2_helper.py`` -provides convenient way for running model. It accepts directory with -converted model and inference device as arguments. For running model we -will use ``generate`` method. - -.. code:: ipython3 - - from ov_florence2_helper import OVFlorence2Model - - # Uncomment the line to see model class code - # ??OVFlorence2Model - -.. code:: ipython3 - - model = OVFlorence2Model(model_path, device.value) - -Additionally, for model usage we also need ``Processor`` class, that -distributed with original model and can be loaded using -``AutoProcessor`` from ``transformers`` library. Processor is -responsible for input data preparation and decoding model output. - -.. code:: ipython3 - - import requests - from PIL import Image - - from transformers import AutoProcessor - - processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) - - prompt = "" - - url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true" - image = Image.open(requests.get(url, stream=True).raw) - - image - - - - -.. image:: florence2-with-output_files/florence2-with-output_15_0.png - - - -Let’s check model capabilities in Object Detection. - -.. code:: ipython3 - - inputs = processor(text=prompt, images=image, return_tensors="pt") - - generated_ids = model.generate(input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"], max_new_tokens=1024, do_sample=False, num_beams=3) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0] - - parsed_answer = processor.post_process_generation(generated_text, task="", image_size=(image.width, image.height)) - -.. code:: ipython3 - - from gradio_helper import plot_bbox - - fig = plot_bbox(image, parsed_answer[""]) - - - -.. image:: florence2-with-output_files/florence2-with-output_18_0.png - - -More model capabilities will be demonstrated in interactive demo. - -Interactive Demo ----------------- - - - -In this section, you can see model in action on various of supported -vision tasks. 
 Please provide an input image or select one from the examples, and -specify the task (note that some tasks additionally require a -text input, e.g. a description of a region for segmentation or a -phrase for grounding). - -.. raw:: html - -
      
- -Click here for more detailed info about supported tasks Florence-2 is -designed to handle a variety of vision and vision-language tasks through -its unified, prompt-based representation. The key vision tasks performed -by Florence-2 include: - -.. raw:: html - -
    - -.. raw:: html - -
  • - -Caption: Generating brief textual descriptions of images, capturing the -essence of the scene. - -.. raw:: html - -
  • - -.. raw:: html - -
  • - -Detailed Caption: Producing more elaborate textual descriptions, -providing richer information about the image. - -.. raw:: html - -
  • - -.. raw:: html - -
  • - -More Detailed Caption: Creating comprehensive textual descriptions that -include extensive details about the image. - -.. raw:: html - -
  • - -.. raw:: html - -
  • - -Region Proposal: Identifying regions of interest within an image to -focus on specific areas. - -.. raw:: html - -
  • - -.. raw:: html - -
  • - -Object Detection: Locating and identifying objects within an image, -providing bounding boxes and labels for each detected object. - -.. raw:: html - -
  • - -.. raw:: html - -
  • - -Dense Region Caption: Generating textual descriptions for densely packed -regions within an image. - -.. raw:: html - -
  • - -.. raw:: html - -
  • - -Phrase Grounding: Associating phrases in a text description with -specific regions in an image, linking textual descriptions to visual -elements. - -.. raw:: html - -
  • - -.. raw:: html - -
   • - -Referring Expression Segmentation: Segmenting regions in an image based on -referring expressions and providing detailed object boundaries, which makes the -model adept at tasks that require fine-grained visual-textual alignment. - -.. raw:: html - -
      
  • - -.. raw:: html - -
  • - -Open Vocabulary Detection: Detecting objects in an image using a -flexible and extensive vocabulary. - -.. raw:: html - -
  • - -.. raw:: html - -
  • - -Region to Text: Converting regions of an image into corresponding -textual descriptions. - -.. raw:: html - -
  • - -.. raw:: html - -
  • - -Text Detection and Recognition: Detecting and recognizing text within an -image, providing both text and region information. - -.. raw:: html - -
  • - -.. raw:: html - -
- -.. raw:: html - -
- -.. code:: ipython3 - - from gradio_helper import make_demo - - demo = make_demo(model, processor) - - try: - demo.launch(debug=False, height=600) - except Exception: - demo.launch(debug=False, share=True, height=600) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - - -.. parsed-literal:: - - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. - - - - - - - diff --git a/docs/notebooks/florence2-with-output_files/florence2-with-output_15_0.jpg b/docs/notebooks/florence2-with-output_files/florence2-with-output_15_0.jpg deleted file mode 100644 index 1e60f7530798c8..00000000000000 --- a/docs/notebooks/florence2-with-output_files/florence2-with-output_15_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2f9a4b19e13babaf8e0233efda137d171b295890a61f5d7e346e30e0ace42662 -size 40990 diff --git a/docs/notebooks/florence2-with-output_files/florence2-with-output_15_0.png b/docs/notebooks/florence2-with-output_files/florence2-with-output_15_0.png deleted file mode 100644 index 3922a958dc5eda..00000000000000 --- a/docs/notebooks/florence2-with-output_files/florence2-with-output_15_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d5d12d13a96b958b56d21fef0d87fee9736d7a721ee99c5adab60493f66a9fc1 -size 394039 diff --git a/docs/notebooks/florence2-with-output_files/florence2-with-output_18_0.png b/docs/notebooks/florence2-with-output_files/florence2-with-output_18_0.png deleted file mode 100644 index c233468fe95f4e..00000000000000 --- a/docs/notebooks/florence2-with-output_files/florence2-with-output_18_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d15ed97d6e50919caff2aee785bc4c90f91dcfcc9bb248f70e9d79bb203be64f -size 259663 diff --git a/docs/notebooks/flux.1-image-generation-with-output.rst b/docs/notebooks/flux.1-image-generation-with-output.rst deleted file mode 100644 index 01911a771794f0..00000000000000 --- a/docs/notebooks/flux.1-image-generation-with-output.rst +++ /dev/null @@ -1,387 +0,0 @@ -Image generation with Flux.1 and OpenVINO -========================================= - -Flux is a AI image generation model developed by `Black Forest -Labs `__. It represents a -significant advancement in AI-generated art, utilizing a hybrid -architecture of `multimodal `__ and -`parallel `__ `diffusion -transformer `__ blocks and scaled to -12B parameter. The model offers state-of-the-art performance image -generation with top of the line prompt following, visual quality, image -detail and output diversity.More details about model can be found in -`blog post `__ -and `original repo `__ - -In this tutorial we consider how to convert and optimized Flux.1 model -using OpenVINO. - - **Note**: Some demonstrated models can require at least 32GB RAM for - conversion and running. 
 - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Select model <#select-model>`__ -- `Convert model with OpenVINO <#convert-model-with-openvino>`__ - - - `Convert model using Optimum - Intel <#convert-model-using-optimum-intel>`__ - - `Compress model weights <#compress-model-weights>`__ - -- `Run OpenVINO model inference <#run-openvino-model-inference>`__ -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "gradio>=4.19" "torch>=2.1" "transformers" "nncf>=2.12.0" "diffusers>=0.31.0" "opencv-python" "pillow" "peft>=0.7.0" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "sentencepiece" "protobuf" - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" - %pip install -qU "openvino>=2024.4.0" - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("cmd_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py") - open("cmd_helper.py", "w").write(r.text) - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/flux.1-image-generation/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("flux.1-image-generation.ipynb") - -Select model ------------- - - - -To strike a balance between accessibility and model capabilities, FLUX.1 -comes in three variants: FLUX.1-pro, FLUX.1-dev and FLUX.1-schnell: - -- **FLUX.1-pro**: the best of FLUX.1, offering state-of-the-art image - generation with top-of-the-line prompt following, visual quality, image - detail and output diversity, but not available for public use. -- **FLUX.1-dev**: an open-weight, guidance-distilled model. Directly - distilled from FLUX.1-pro, FLUX.1-dev obtains similar quality and prompt - adherence capabilities, while being more efficient than a standard model - of the same size. FLUX.1-dev weights are available on - `HuggingFace `__. -- **FLUX.1-schnell**: the fastest model of the Flux family, tailored for - local development and personal use. FLUX.1-schnell is openly available - under an Apache 2.0 license. Similarly to FLUX.1-dev, the weights are - available on `HuggingFace `__. - -.. figure:: https://github.com/user-attachments/assets/c7f9df6b-cff3-4d33-98d7-1bb400b2861c - :alt: family.png - - family.png - -By default, we will use the FLUX.1-schnell model, but you can switch to -the FLUX.1-dev version using the widget below. - -.. 
code:: ipython3 - - import ipywidgets as widgets - - model_ids = ["black-forest-labs/FLUX.1-schnell", "black-forest-labs/FLUX.1-dev"] - - model_selector = widgets.Dropdown( - options=model_ids, - default=model_ids[0], - description="Model:", - ) - - - model_selector - - - - -.. parsed-literal:: - - Dropdown(description='Model:', options=('black-forest-labs/FLUX.1-schnell', 'black-forest-labs/FLUX.1-dev'), v… - - - - **Note**: run Flux.1-dev model with notebook, you will need to accept - license agreement. You must be a registered user in Hugging Face - Hub. Please visit `HuggingFace model - card `__, - carefully read terms of usage and click accept button. You will need - to use an access token for the code below to run. For more - information on access tokens, refer to `this section of the - documentation `__. - You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: ipython3 - - # uncomment these lines to login to huggingfacehub to get access to pretrained model - - # from huggingface_hub import notebook_login, whoami - - # try: - # whoami() - # print('Authorization token already provided') - # except OSError: - # notebook_login() - -Convert model with OpenVINO ---------------------------- - - - -Starting from 2023.0 release, OpenVINO supports PyTorch models directly -via Model Conversion API. ``ov.convert_model`` function accepts instance -of PyTorch model and example inputs for tracing and returns object of -``ov.Model`` class, ready to use or save on disk using ``ov.save_model`` -function. - -The pipeline consists of four important parts: - -- Clip and T5 Text Encoders to create condition to generate an image - from a text prompt. -- Transformer for step-by-step denoising latent image representation. -- Autoencoder (VAE) for decoding latent space to image. - -Convert model using Optimum Intel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -For convenience, we will use OpenVINO integration with HuggingFace -Optimum. `Optimum -Intel `__ is the -interface between the Transformers and Diffusers libraries and the -different tools and libraries provided by Intel to accelerate end-to-end -pipelines on Intel architectures. - -Among other use cases, Optimum Intel provides a simple interface to -optimize your Transformers and Diffusers models, convert them to the -OpenVINO Intermediate Representation (IR) format and run inference using -OpenVINO Runtime. ``optimum-cli`` provides command line interface for -model conversion and optimization. - -General command format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where task is task to export the model for, if not specified, the task -will be auto-inferred based on the model. You can find a mapping between -tasks and model classes in Optimum TaskManager -`documentation `__. -Additionally, you can specify weights compression using -``--weight-format`` argument with one of following options: ``fp32``, -``fp16``, ``int8`` and ``int4``. For int8 and int4 -`nncf `__ will be used for -weight compression. More details about model export provided in `Optimum -Intel -documentation `__. - -Compress model weights -~~~~~~~~~~~~~~~~~~~~~~ - - - -For reducing model memory consumption we will use weights compression. -The `Weights -Compression `__ -algorithm is aimed at compressing the weights of the models and can be -used to optimize the model footprint and performance of large models -where the size of weights is relatively larger than the size of -activations, for example, Large Language Models (LLM). 
Compared to INT8 -compression, INT4 compression improves performance even more, but -introduces a minor drop in prediction quality. We will use -`NNCF `__ integration to -``optimum-cli`` tool for weight compression. - -.. code:: ipython3 - - to_compress = widgets.Checkbox( - value=True, - description="Weight compression", - disabled=False, - ) - - to_compress - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Weight compression') - - - -.. code:: ipython3 - - from pathlib import Path - - model_id = model_selector.value - - model_base_dir = Path(model_id.split("/")[-1]) - additional_args = {} - - if to_compress.value: - model_dir = model_base_dir / "INT4" - additional_args.update({"weight-format": "int4", "group-size": "64", "ratio": "1.0"}) - else: - model_dir = model_base_dir / "FP16" - additional_args.update({"weight-format": "fp16"}) - -.. code:: ipython3 - - from cmd_helper import optimum_cli - - if not model_dir.exists(): - optimum_cli(model_id, model_dir, additional_args=additional_args) - -Run OpenVINO model inference ----------------------------- - - - -``OVDiffusionPipeline`` from Optimum Intel provides ready-to-use -interface for running Diffusers models using OpenVINO. It supports -various models including Stable Diffusion, Stable Diffusion XL, LCM, -Stable Diffusion v3 and Flux. Similar to original Diffusers pipeline, -for initialization, we should use ``from_preptrained`` method providing -model id from HuggingFace hub or local directory (both original PyTorch -and OpenVINO models formats supported, in the first case model class -additionally will trigger model conversion). - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(default="CPU", exclude=["NPU"]) - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -.. code:: ipython3 - - import ipywidgets as widgets - - model_available = (model_base_dir / "INT4").is_dir() - use_quantized_models = widgets.Checkbox( - value=model_available, - description="Use compressed models", - disabled=not model_available, - ) - - use_quantized_models - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Use compressed models') - - - -.. code:: ipython3 - - from optimum.intel.openvino import OVDiffusionPipeline - - model_dir = model_base_dir / "INT4" if use_quantized_models.value else model_base_dir / "FP16" - - ov_pipe = OVDiffusionPipeline.from_pretrained(model_dir, device=device.value) - - -.. parsed-literal:: - - 2024-10-28 18:12:30.714636: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
- 2024-10-28 18:12:30.727116: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - WARNING: All log messages before absl::InitializeLog() is called are written to STDERR - E0000 00:00:1730124750.741387 52454 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - E0000 00:00:1730124750.745955 52454 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2024-10-28 18:12:30.761443: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers - - -.. code:: ipython3 - - import torch - - prompt = "A cat holding a sign that says hello OpenVINO" - image = ov_pipe( - prompt, guidance_scale=0.0, num_inference_steps=4, max_sequence_length=256, generator=torch.Generator("cpu").manual_seed(0), height=256, width=256 - ).images[0] - - image - - - -.. parsed-literal:: - - 0%| | 0/4 [00:00`__ allows alter the voice of -a source speaker to a target style, while keeping the linguistic content -unchanged, without text annotation. - -Figure bellow illustrates model architecture of FreeVC for inference. In -this notebook we concentrate only on inference part. There are three -main parts: Prior Encoder, Speaker Encoder and Decoder. The prior -encoder contains a WavLM model, a bottleneck extractor and a normalizing -flow. Detailed information is available in this -`paper `__. - -.. figure:: https://github.com/OlaWod/FreeVC/blob/main/resources/infer.png?raw=true - :alt: Inference - - Inference - -`\**image_source\* `__ - -FreeVC suggests only command line interface to use and only with CUDA. -In this notebook it shows how to use FreeVC in Python and without CUDA -devices. It consists of the following steps: - -- Download and prepare models. -- Inference. -- Convert models to OpenVINO Intermediate Representation. -- Inference using only OpenVINO’s IR models. - - -**Table of contents:** - - -- `Pre-requisites <#pre-requisites>`__ -- `Imports and settings <#imports-and-settings>`__ -- `Convert Modes to OpenVINO Intermediate - Representation <#convert-modes-to-openvino-intermediate-representation>`__ - - - `Convert Prior Encoder. <#convert-prior-encoder->`__ - - `Convert SpeakerEncoder <#convert-speakerencoder>`__ - - `Convert Decoder <#convert-decoder>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Pre-requisites --------------- - - - -This steps can be done manually or will be performed automatically -during the execution of the notebook, but in minimum necessary scope. 1. -Clone this repo: git clone https://github.com/OlaWod/FreeVC.git. 2. -Download -`WavLM-Large `__ -and put it under directory ``FreeVC/wavlm/``. 3. You can download the -`VCTK `__ dataset. For -this example we download only two of them from `Hugging Face FreeVC -example `__. 4. 
-Download `pretrained -models `__ -and put it under directory ‘checkpoints’ (for current example only -``freevc.pth`` are required). - -Install extra requirements - -.. code:: ipython3 - - %pip install -q "openvino>=2023.3.0" "librosa>=0.8.1" "webrtcvad==2.0.10" "gradio>=4.19" "torch>=2.1" gdown scipy tqdm torchvision --extra-index-url https://download.pytorch.org/whl/cpu - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -.. code:: ipython3 - - # Fetch `notebook_utils` module - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("freevc-voice-conversion.ipynb") - -.. code:: ipython3 - - from pathlib import Path - - import gdown - from cmd_helper import clone_repo - from notebook_utils import download_file, device_widget - - - clone_repo("https://github.com/OlaWod/FreeVC.git") - - - wavlm_large_dir_path = Path("FreeVC/wavlm") - wavlm_large_path = wavlm_large_dir_path / "WavLM-Large.pt" - - wavlm_url = "https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU&confirm=t&uuid=a703c43c-ccce-436c-8799-c11b88e9e7e4" - - if not wavlm_large_path.exists(): - gdown.download(wavlm_url, str(wavlm_large_path)) - - -.. parsed-literal:: - - Downloading... - From: https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU&confirm=t&uuid=a703c43c-ccce-436c-8799-c11b88e9e7e4 - To: /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/freevc-voice-conversion/FreeVC/wavlm/WavLM-Large.pt - 100%|██████████| 1.26G/1.26G [00:19<00:00, 63.4MB/s] - - -.. code:: ipython3 - - freevc_chpt_dir = Path("checkpoints") - freevc_chpt_name = "freevc.pth" - freevc_chpt_path = freevc_chpt_dir / freevc_chpt_name - - if not freevc_chpt_path.exists(): - download_file( - f"https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/freevc/{freevc_chpt_name}", - directory=freevc_chpt_dir, - ) - - - -.. parsed-literal:: - - freevc.pth: 0%| | 0.00/451M [00:00`__. - -Convert Prior Encoder. -~~~~~~~~~~~~~~~~~~~~~~ - - - -First we convert WavLM model, as a part of Convert Prior Encoder to -OpenVINO’s IR format. We keep the original name of the model in code: -``cmodel``. - -.. code:: ipython3 - - # define forward as extract_features for compatibility - cmodel.forward = cmodel.extract_features - -.. code:: ipython3 - - OUTPUT_DIR = Path("output") - BASE_MODEL_NAME = "cmodel" - - OUTPUT_DIR.mkdir(exist_ok=True) - - ir_cmodel_path = Path(OUTPUT_DIR / (BASE_MODEL_NAME + "_ir")).with_suffix(".xml") - - length = 32000 - - dummy_input = torch.randn(1, length) - -Converting to OpenVINO’s IR format. - -.. 
code:: ipython3 - - core = ov.Core() - - - class ModelWrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, input): - return self.model(input)[0] - - - if not ir_cmodel_path.exists(): - ir_cmodel = ov.convert_model(ModelWrapper(cmodel), example_input=dummy_input) - ov.save_model(ir_cmodel, ir_cmodel_path) - else: - ir_cmodel = core.read_model(ir_cmodel_path) - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/freevc-voice-conversion/FreeVC/wavlm/modules.py:495: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert embed_dim == self.embed_dim - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/freevc-voice-conversion/FreeVC/wavlm/modules.py:496: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert list(query.size()) == [tgt_len, bsz, embed_dim] - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/freevc-voice-conversion/FreeVC/wavlm/modules.py:500: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert key_bsz == bsz - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/notebooks/freevc-voice-conversion/FreeVC/wavlm/modules.py:502: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert src_len, bsz == value.shape[:2] - - -Select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - compiled_cmodel = core.compile_model(ir_cmodel, device.value) - -Convert ``SpeakerEncoder`` -~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - OUTPUT_DIR = Path("output") - BASE_MODEL_NAME = "smodel" - - OUTPUT_DIR.mkdir(exist_ok=True) - - ir_smodel_path = Path(OUTPUT_DIR / (BASE_MODEL_NAME + "ir")).with_suffix(".xml") - - - length = 32000 - - dummy_input = torch.randn(1, length, 40) - - if not ir_smodel_path.exists(): - ir_smodel = ov.convert_model(smodel, example_input=dummy_input) - ov.save_model(ir_smodel, ir_smodel_path) - else: - ir_smodel = core.read_model(ir_smodel_path) - -For preparing input for inference, we should define helper functions -based on ``speaker_encoder.voice_encoder.SpeakerEncoder`` class methods - -.. 
code:: ipython3 - - from speaker_encoder.hparams import sampling_rate, mel_window_step, partials_n_frames - from speaker_encoder import audio - - - def compute_partial_slices(n_samples: int, rate, min_coverage): - """ - Computes where to split an utterance waveform and its corresponding mel spectrogram to - obtain partial utterances of each. Both the waveform and the - mel spectrogram slices are returned, so as to make each partial utterance waveform - correspond to its spectrogram. - - The returned ranges may be indexing further than the length of the waveform. It is - recommended that you pad the waveform with zeros up to wav_slices[-1].stop. - - :param n_samples: the number of samples in the waveform - :param rate: how many partial utterances should occur per second. Partial utterances must - cover the span of the entire utterance, thus the rate should not be lower than the inverse - of the duration of a partial utterance. By default, partial utterances are 1.6s long and - the minimum rate is thus 0.625. - :param min_coverage: when reaching the last partial utterance, it may or may not have - enough frames. If at least of are present, - then the last partial utterance will be considered by zero-padding the audio. Otherwise, - it will be discarded. If there aren't enough frames for one partial utterance, - this parameter is ignored so that the function always returns at least one slice. - :return: the waveform slices and mel spectrogram slices as lists of array slices. Index - respectively the waveform and the mel spectrogram with these slices to obtain the partial - utterances. - """ - assert 0 < min_coverage <= 1 - - # Compute how many frames separate two partial utterances - samples_per_frame = int((sampling_rate * mel_window_step / 1000)) - n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) - frame_step = int(np.round((sampling_rate / rate) / samples_per_frame)) - assert 0 < frame_step, "The rate is too high" - assert frame_step <= partials_n_frames, "The rate is too low, it should be %f at least" % (sampling_rate / (samples_per_frame * partials_n_frames)) - - # Compute the slices - wav_slices, mel_slices = [], [] - steps = max(1, n_frames - partials_n_frames + frame_step + 1) - for i in range(0, steps, frame_step): - mel_range = np.array([i, i + partials_n_frames]) - wav_range = mel_range * samples_per_frame - mel_slices.append(slice(*mel_range)) - wav_slices.append(slice(*wav_range)) - - # Evaluate whether extra padding is warranted or not - last_wav_range = wav_slices[-1] - coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) - if coverage < min_coverage and len(mel_slices) > 1: - mel_slices = mel_slices[:-1] - wav_slices = wav_slices[:-1] - - return wav_slices, mel_slices - - - def embed_utterance( - wav: np.ndarray, - smodel: ov.CompiledModel, - return_partials=False, - rate=1.3, - min_coverage=0.75, - ): - """ - Computes an embedding for a single utterance. The utterance is divided in partial - utterances and an embedding is computed for each. The complete utterance embedding is the - L2-normed average embedding of the partial utterances. - - :param wav: a preprocessed utterance waveform as a numpy array of float32 - :param smodel: compiled speaker encoder model. - :param return_partials: if True, the partial embeddings will also be returned along with - the wav slices corresponding to each partial utterance. - :param rate: how many partial utterances should occur per second. 
Partial utterances must - cover the span of the entire utterance, thus the rate should not be lower than the inverse - of the duration of a partial utterance. By default, partial utterances are 1.6s long and - the minimum rate is thus 0.625. - :param min_coverage: when reaching the last partial utterance, it may or may not have - enough frames. If at least of are present, - then the last partial utterance will be considered by zero-padding the audio. Otherwise, - it will be discarded. If there aren't enough frames for one partial utterance, - this parameter is ignored so that the function always returns at least one slice. - :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If - is True, the partial utterances as a numpy array of float32 of shape - (n_partials, model_embedding_size) and the wav partials as a list of slices will also be - returned. - """ - # Compute where to split the utterance into partials and pad the waveform with zeros if - # the partial utterances cover a larger range. - wav_slices, mel_slices = compute_partial_slices(len(wav), rate, min_coverage) - max_wave_length = wav_slices[-1].stop - if max_wave_length >= len(wav): - wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") - - # Split the utterance into partials and forward them through the model - mel = audio.wav_to_mel_spectrogram(wav) - mels = np.array([mel[s] for s in mel_slices]) - with torch.no_grad(): - mels = torch.from_numpy(mels).to(torch.device("cpu")) - output_layer = smodel.output(0) - partial_embeds = smodel(mels)[output_layer] - - # Compute the utterance embedding from the partial embeddings - raw_embed = np.mean(partial_embeds, axis=0) - embed = raw_embed / np.linalg.norm(raw_embed, 2) - - if return_partials: - return embed, partial_embeds, wav_slices - return embed - -Select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Then compile model. - -.. code:: ipython3 - - compiled_smodel = core.compile_model(ir_smodel, device.value) - -Convert Decoder -~~~~~~~~~~~~~~~ - - - -In the same way export ``SynthesizerTrn`` model, that implements decoder -function to OpenVINO IR format. - -.. code:: ipython3 - - OUTPUT_DIR = Path("output") - BASE_MODEL_NAME = "net_g" - onnx_net_g_path = Path(OUTPUT_DIR / (BASE_MODEL_NAME + "_fp32")).with_suffix(".onnx") - ir_net_g_path = Path(OUTPUT_DIR / (BASE_MODEL_NAME + "ir")).with_suffix(".xml") - - dummy_input_1 = torch.randn(1, 1024, 81) - dummy_input_2 = torch.randn(1, 256) - - # define forward as infer - net_g.forward = net_g.infer - - - if not ir_net_g_path.exists(): - ir_net_g_model = ov.convert_model(net_g, example_input=(dummy_input_1, dummy_input_2)) - ov.save_model(ir_net_g_model, ir_net_g_path) - else: - ir_net_g_model = core.read_model(ir_net_g_path) - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: - Tensor-likes are not close! 
- - Mismatched elements: 25906 / 25920 (99.9%) - Greatest absolute difference: 0.38282541930675507 at index (0, 0, 4270) (up to 1e-05 allowed) - Greatest relative difference: 11932.686300610094 at index (0, 0, 25133) (up to 1e-05 allowed) - _check_trace( - - -Select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - compiled_ir_net_g_model = core.compile_model(ir_net_g_model, device.value) - -Define function for synthesizing. - -.. code:: ipython3 - - def synthesize_audio(src, tgt): - wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate) - wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) - - g_tgt = embed_utterance(wav_tgt, compiled_smodel) - g_tgt = torch.from_numpy(g_tgt).unsqueeze(0) - - # src - wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate) - wav_src = np.expand_dims(wav_src, axis=0) - - output_layer = compiled_cmodel.output(0) - c = compiled_cmodel(wav_src)[output_layer] - c = c.transpose((0, 2, 1)) - - output_layer = compiled_ir_net_g_model.output(0) - tgt_audio = compiled_ir_net_g_model((c, g_tgt))[output_layer] - tgt_audio = tgt_audio[0][0] - - return tgt_audio - -And now we can check inference using only IR models. - -.. code:: ipython3 - - result_wav_names = [] - - with torch.no_grad(): - for line in tqdm(zip(srcs, tgts)): - src, tgt = line - - output_audio = synthesize_audio(src, tgt) - - timestamp = time.strftime("%m-%d_%H-%M", time.localtime()) - result_name = f"{timestamp}.wav" - result_wav_names.append(result_name) - write( - os.path.join("outputs/freevc", result_name), - hps.data.sampling_rate, - output_audio, - ) - - -.. parsed-literal:: - - 2it [00:01, 1.29it/s] - - -Result audio files should be available in ‘outputs/freevc’ and you can -check them and compare with generated earlier. Below one of the results -presents. - -Source audio (source of text): - -.. code:: ipython3 - - import IPython.display as ipd - - ipd.Audio(srcs[0]) - - - - -.. raw:: html - - - - - - - -Target audio (source of voice): - -.. code:: ipython3 - - ipd.Audio(tgts[0]) - - - - -.. raw:: html - - - - - - - -Result audio: - -.. code:: ipython3 - - ipd.Audio(f"outputs/freevc/{result_wav_names[0]}") - - - - -.. raw:: html - - - - - - - -Also, you can use your own audio file. Just upload them and use for -inference. Use rate corresponding to the value of -``hps.data.sampling_rate``. - -.. code:: ipython3 - - def infer(src, tgt): - output_audio = synthesize_audio(src, tgt) - timestamp = time.strftime("%m-%d_%H-%M", time.localtime()) - result_name = f"{timestamp}.wav" - write(result_name, hps.data.sampling_rate, output_audio) - return result_name - - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/freevc-voice-conversion/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=infer) - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(debug=False, share=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/" - - -.. parsed-literal:: - - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. - - - - - - - - -.. 
code:: ipython3 - - # please uncomment and run this cell for stopping gradio interface - # demo.close() diff --git a/docs/notebooks/glm-edge-v-with-output.rst b/docs/notebooks/glm-edge-v-with-output.rst deleted file mode 100644 index 027d1e1f1a5a9d..00000000000000 --- a/docs/notebooks/glm-edge-v-with-output.rst +++ /dev/null @@ -1,518 +0,0 @@ -Visual-language assistant with GLM-Edge-V and OpenVINO ------------------------------------------------------- - -The -`GLM-Edge `__ -series is `Zhipu `__\ ’s attempt to meet -real-world deployment scenarios for edge devices. It consists of two -sizes of large language dialogue models and multimodal understanding -models (GLM-Edge-1.5B-Chat, GLM-Edge-4B-Chat, GLM-Edge-V-2B, -GLM-Edge-V-5B). Among them, the 1.5B / 2B models are mainly targeted at -platforms like mobile phones and car machines, while the 4B / 5B models -are aimed at platforms like PCs. Based on the technological advancements -of the GLM-4 series, some targeted adjustments have been made to the -model structure and size, balancing model performance, real-world -inference efficiency, and deployment convenience. Through deep -collaboration with partner enterprises and relentless efforts in -inference optimization, the GLM-Edge series models can run at extremely -high speeds on some edge platforms. - -In this tutorial we consider how to launch multimodal model GLM-Edge-V -using OpenVINO for creation multimodal chatbot. Additionally, we -optimize model to low precision using -`NNCF `__ - -**Table of contents:** - -- `Prerequisites <#prerequisites>`__ -- `Select Model <#select-model>`__ -- `Convert and Optimize model <#convert-and-optimize-model>`__ - - - `Compress model weights to - 4-bit <#compress-model-weights-to-4-bit>`__ - -- `Select inference device <#select-inference-device>`__ -- `Run OpenVINO model <#run-openvino-model>`__ -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -install required packages and setup helper functions. - -.. code:: ipython3 - - %pip install -q "torch>=2.1" "torchvision" "protobuf>=3.20" "gradio>=4.26" "Pillow" "accelerate" "tqdm" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "openvino>=2024.5.0" "nncf>=2.14.0" - %pip install -q "git+https://github.com/huggingface/transformers" --extra-index-url https://download.pytorch.org/whl/cpu - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - ERROR: Could not find a version that satisfies the requirement openvino>=2024.5.0 (from versions: 2021.3.0, 2021.4.0, 2021.4.1, 2021.4.2, 2022.1.0, 2022.2.0, 2022.3.0, 2022.3.1, 2022.3.2, 2023.0.0.dev20230119, 2023.0.0.dev20230217, 2023.0.0.dev20230407, 2023.0.0.dev20230427, 2023.0.0, 2023.0.1, 2023.0.2, 2023.1.0.dev20230623, 2023.1.0.dev20230728, 2023.1.0.dev20230811, 2023.1.0, 2023.2.0.dev20230922, 2023.2.0, 2023.3.0, 2024.0.0, 2024.1.0, 2024.2.0, 2024.3.0, 2024.4.0, 2024.4.1.dev20240926) - ERROR: No matching distribution found for openvino>=2024.5.0 - Note: you may need to restart the kernel to use updated packages. - error: subprocess-exited-with-error - - × Preparing metadata (pyproject.toml) did not run successfully. 
- │ exit code: 1 - ╰─> [6 lines of output] - - Cargo, the Rust package manager, is not installed or is not on PATH. - This package requires Rust and Cargo to compile extensions. Install it through - the system's package manager or via https://rustup.rs/ - - Checking for Rust toolchain.... - [end of output] - - note: This error originates from a subprocess, and is likely not a problem with pip. - error: metadata-generation-failed - - × Encountered error while generating package metadata. - ╰─> See above for output. - - note: This is an issue with the package mentioned above, not pip. - hint: See above for details. - Note: you may need to restart the kernel to use updated packages. - - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("glmv_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/glm-edge-v/glmv_helper.py") - open("glmv_helper.py", "w").write(r.text) - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/glm-edge-v/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("glm-edge-v.ipynb") - -Select Model ------------- - - - -The tutorial supports the following models from GLM-Edge-V model family: - -- `glm-edge-v-2b `__ -- `glm-edge-v-5b `__ - -You can select one from the provided options below. - -.. code:: ipython3 - - import ipywidgets as widgets - - # Select model - model_ids = [ - "THUDM/glm-edge-v-2b", - "THUDM/glm-edge-v-5b", - ] - - model_dropdown = widgets.Dropdown( - options=model_ids, - value=model_ids[0], - description="Model:", - disabled=False, - ) - - model_dropdown - - - - -.. parsed-literal:: - - Dropdown(description='Model:', options=('THUDM/glm-edge-v-2b', 'THUDM/glm-edge-v-5b'), value='THUDM/glm-edge-v… - - - -Convert and Optimize model --------------------------- - - - -GLM-Edge-V is PyTorch model. OpenVINO supports PyTorch models via -conversion to OpenVINO Intermediate Representation (IR). `OpenVINO model -conversion -API `__ -should be used for these purposes. ``ov.convert_model`` function accepts -original PyTorch model instance and example input for tracing and -returns ``ov.Model`` representing this model in OpenVINO framework. -Converted model can be used for saving on disk using ``ov.save_model`` -function or directly loading on device using ``core.complie_model``. - -The script ``glmv_helper.py`` contains helper function for model -conversion, please check its content if you interested in conversion -details. - -.. raw:: html - -
-
-Click here for more detailed explanation of conversion steps
-
-GLM-Edge-V is an autoregressive transformer generative model, which
-means that each next model step depends on the model output from the
-previous step. The generation approach is based on the assumption that
-the probability distribution of a word sequence can be decomposed into
-the product of conditional next-word distributions. In other words, the
-model predicts the next token in a loop, guided by the previously
-generated tokens, until the stop condition is reached (the generated
-sequence reaches the maximum length or an end-of-string token is
-produced). The way the next token is selected from the predicted
-probabilities is driven by the chosen decoding methodology. You can
-find more information about the most popular decoding methods in this
-blog. The entry point for the generation process for models from the
-Hugging Face Transformers library is the ``generate`` method. You can
-find more information about its parameters and configuration in the
-documentation. To preserve flexibility in selecting the decoding
-methodology, we convert only a single inference step of the model; a
-minimal sketch of such a decoding loop is shown below.
-
-The GLM-Edge-V model consists of 3 parts:
-
-- **Vision Model** for encoding input images into embedding space.
-- **Embedding Model** for converting input text tokens into embedding
-  space.
-- **Language Model** for generating the answer based on the input
-  embeddings provided by the Image Encoder and Input Embedding models.
-
-.. raw:: html
-
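-To make the single-step usage more concrete, the cell below sketches a
-greedy decoding loop around a per-step model. It is illustrative only:
-``toy_single_step``, the toy vocabulary, and the token ids are
-hypothetical stand-ins and are not part of the converted GLM-Edge-V
-pipeline, which delegates generation to the ``generate`` method.
-
-.. code:: ipython3
-
-    import numpy as np
-
-    VOCAB_SIZE = 8
-    EOS_TOKEN_ID = 7
-    MAX_NEW_TOKENS = 16
-
-
-    def toy_single_step(token_ids):
-        # Stand-in for one inference step of a converted model: returns a
-        # probability distribution over a toy vocabulary, conditioned (here
-        # only formally) on the tokens generated so far.
-        rng = np.random.default_rng(seed=len(token_ids))
-        logits = rng.normal(size=VOCAB_SIZE)
-        return np.exp(logits) / np.exp(logits).sum()
-
-
-    generated = [0]  # a hypothetical start-of-sequence token
-    for _ in range(MAX_NEW_TOKENS):
-        probs = toy_single_step(generated)
-        next_token = int(np.argmax(probs))  # greedy decoding: take the most probable token
-        generated.append(next_token)
-        if next_token == EOS_TOKEN_ID:  # stop condition: end-of-string token
-            break
-
-    print(generated)
-
-In the real pipeline, the per-step call is an OpenVINO inference request,
-and the next token may also be sampled (for example, with top-k) instead
-of being taken greedily.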
-
-Compress model weights to 4-bit
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-To reduce memory consumption, weight compression optimization can be
-applied using `NNCF `__.
-
-.. raw:: html
-
- -Click here for more details about weight compression Weight compression -aims to reduce the memory footprint of a model. It can also lead to -significant performance improvement for large memory-bound models, such -as Large Language Models (LLMs). LLMs and other models, which require -extensive memory to store the weights during inference, can benefit from -weight compression in the following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method primarily -designed to optimize LLMs. The main difference between weights -compression and full model quantization (post-training quantization) is -that activations remain floating-point in the case of weights -compression which leads to a better accuracy. Weight compression for -LLMs provides a solid inference performance improvement which is on par -with the performance of the full model quantization. In addition, weight -compression is data-free and does not require a calibration dataset, -making it easy to use. - -``nncf.compress_weights`` function can be used for performing weights -compression. The function accepts an OpenVINO model and other -compression parameters. Compared to INT8 compression, INT4 compression -improves performance even more, but introduces a minor drop in -prediction quality. - -More details about weights compression, can be found in `OpenVINO -documentation `__. - -.. raw:: html - -
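-For reference, a direct weight-compression call on an already converted
-``ov.Model`` looks roughly like the sketch below. This is an assumption
-for illustration only: the file paths are hypothetical, and in this
-tutorial the compression is performed inside ``convert_glmv_model`` with
-the configuration shown in the next cell.
-
-.. code:: ipython3
-
-    import nncf
-    import openvino as ov
-
-    core = ov.Core()
-    # Hypothetical path to a language model already converted with ov.convert_model
-    ov_model = core.read_model("model/glm-edge-v-2b/FP16/language_model.xml")
-
-    compressed_model = nncf.compress_weights(
-        ov_model,
-        mode=nncf.CompressWeightsMode.INT4_SYM,  # 4-bit symmetric weight quantization
-        group_size=64,  # quantization scales are shared within groups of 64 weights
-        ratio=0.6,  # ~60% of the weights go to 4-bit, the rest stay in 8-bit
-    )
-    ov.save_model(compressed_model, "model/glm-edge-v-2b/INT4/language_model.xml")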
- -.. code:: ipython3 - - from pathlib import Path - import nncf - from glmv_helper import convert_glmv_model - - - model_id = model_dropdown.value - out_dir = Path("model") / Path(model_id).name / "INT4" - compression_configuration = { - "mode": nncf.CompressWeightsMode.INT4_SYM, - "group_size": 64, - "ratio": 0.6, - } - convert_glmv_model(model_id, out_dir, compression_configuration) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -.. parsed-literal:: - - 2025-02-04 02:31:50.386808: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2025-02-04 02:31:50.420435: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2025-02-04 02:31:50.973389: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - -.. parsed-literal:: - - ⌛ glm-edge-v-2b conversion started. Be patient, it may takes some time. - ⌛ Load Original model - ✅ Original model successfully loaded - ⌛ Convert Input embedding model - WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. - - -.. parsed-literal:: - - [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead - warnings.warn( - `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. - - -.. parsed-literal:: - - ✅ Input embedding model successfully converted - ⌛ Convert Image embedding model - - -.. parsed-literal:: - - /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/THUDM/glm-edge-v-2b/2053707733f99ab52e943904f43c2359a94301ef/siglip.py:48: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - grid_size = int(s**0.5) - /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/THUDM/glm-edge-v-2b/2053707733f99ab52e943904f43c2359a94301ef/siglip.py:53: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. - image_emb = torch.cat([self.boi.repeat(len(image_emb), 1, 1), image_emb, self.eoi.repeat(len(image_emb), 1, 1)], dim=1) - - -.. parsed-literal:: - - ✅ Image embedding model successfully converted - ⌛ Convert Language model - - -.. 
parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:458: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. - or len(self.key_cache[layer_idx]) == 0 # the layer has no cache - /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/THUDM/glm-edge-v-2b/2053707733f99ab52e943904f43c2359a94301ef/modeling_glm.py:1010: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if sequence_length != 1: - /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/THUDM/glm-edge-v-2b/2053707733f99ab52e943904f43c2359a94301ef/modeling_glm.py:153: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - rotary_dim = int(q.shape[-1] * partial_rotary_factor) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. - elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors - /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/THUDM/glm-edge-v-2b/2053707733f99ab52e943904f43c2359a94301ef/modeling_glm.py:249: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:168: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at aten/src/ATen/core/TensorBody.h:489.) - if a.grad is not None: - - -.. parsed-literal:: - - ✅ Language model successfully converted - ⌛ Weights compression with int4_sym mode started - - - -.. parsed-literal:: - - Output() - - - - - - - - - -.. 
parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 45% (115 / 169) │ 40% (114 / 168) │ - ├────────────────┼─────────────────────────────┼────────────────────────────────────────┤ - │ 4 │ 55% (54 / 169) │ 60% (54 / 168) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - - - -.. parsed-literal:: - - Output() - - - - - - - - - -.. parsed-literal:: - - ✅ Weights compression finished - ✅ glm-edge-v-2b model conversion finished. You can find results in model/glm-edge-v-2b/INT4 - - -Select inference device ------------------------ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(default="AUTO", exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Run OpenVINO model ------------------- - - - -``OvGLMv`` class provides convenient way for running model. It accepts -directory with converted model and inference device as arguments. For -running model we will use ``generate`` method. - -.. code:: ipython3 - - from glmv_helper import OvGLMv - - model = OvGLMv(out_dir, device.value) - -.. code:: ipython3 - - import requests - from PIL import Image - - image_path = Path("cat.png") - - if not image_path.exists(): - url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11" - image = Image.open(requests.get(url, stream=True).raw) - image.save(image_path) - else: - image = Image.open(image_path) - - query = "Please describe this picture" - - print(f"Question:\n {query}") - image - - -.. parsed-literal:: - - Question: - Please describe this picture - - - - -.. image:: glm-edge-v-with-output_files/glm-edge-v-with-output_12_1.png - - - -.. code:: ipython3 - - from transformers import TextStreamer, AutoImageProcessor, AutoTokenizer - import torch - - messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": query}]}] - - processor = AutoImageProcessor.from_pretrained(out_dir, trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(out_dir, trust_remote_code=True) - inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_dict=True, tokenize=True, return_tensors="pt").to("cpu") - generate_kwargs = { - **inputs, - "pixel_values": torch.tensor(processor(image).pixel_values).to("cpu"), - "max_new_tokens": 100, - "do_sample": True, - "top_k": 20, - "streamer": TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True), - } - - print("Answer:") - output = model.generate(**generate_kwargs) - - -.. parsed-literal:: - - Answer: - This image is of a grey cat laying comfortably inside an open cardboard box with its back turned to the viewer. The box is situated on a white carpet, suggesting a cozy indoor setting. There is furniture visible in the background, though it partially blurs out the details. The cat appears relaxed, with one of its paws stretched out, and its tail extended to the side, indicating it's enjoying some rest. The room has a soft, well-maintained appearance, with - - -Interactive demo ----------------- - - - -.. 
code:: ipython3 - - from gradio_helper import make_demo - - demo = make_demo(model, processor, tokenizer) - - try: - demo.launch(debug=False, height=600) - except Exception: - demo.launch(debug=False, share=True, height=600) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - - -.. parsed-literal:: - - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. - - - - - - - diff --git a/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_12_1.jpg b/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_12_1.jpg deleted file mode 100644 index c6aeec77cd3cb2..00000000000000 --- a/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_12_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5fc0d22d75f23474fb4f8aec8c0bf0fdf5d9377f3379e82a3887003e6da47e7e -size 60425 diff --git a/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_12_1.png b/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_12_1.png deleted file mode 100644 index c6673a757ab5dc..00000000000000 --- a/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_12_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c715d8adee4bf7519690de20b57ef2edaa2f914c86a64d107f99a919dcdad218 -size 854224 diff --git a/docs/notebooks/gpu-device-with-output.rst b/docs/notebooks/gpu-device-with-output.rst deleted file mode 100644 index e96c3011fb325b..00000000000000 --- a/docs/notebooks/gpu-device-with-output.rst +++ /dev/null @@ -1,1452 +0,0 @@ -Working with GPUs in OpenVINO™ -============================== - - -**Table of contents:** - - -- `Introduction <#introduction>`__ - - - `Install required packages <#install-required-packages>`__ - -- `Checking GPUs with Query - Device <#checking-gpus-with-query-device>`__ - - - `List GPUs with - core.available_devices <#list-gpus-with-core-available_devices>`__ - - `Check Properties with - core.get_property <#check-properties-with-core-get_property>`__ - - `Brief Descriptions of Key - Properties <#brief-descriptions-of-key-properties>`__ - -- `Compiling a Model on GPU <#compiling-a-model-on-gpu>`__ - - - `Download a Model <#download-a-model>`__ - - `Compile with Default - Configuration <#compile-with-default-configuration>`__ - - `Reduce Compile Time through Model - Caching <#reduce-compile-time-through-model-caching>`__ - - `Throughput and Latency Performance - Hints <#throughput-and-latency-performance-hints>`__ - - `Using Multiple GPUs with Multi-Device and Cumulative - Throughput <#using-multiple-gpus-with-multi-device-and-cumulative-throughput>`__ - -- `Performance Comparison with - benchmark_app <#performance-comparison-with-benchmark_app>`__ - - - `CPU vs GPU with Latency Hint <#cpu-vs-gpu-with-latency-hint>`__ - - `CPU vs GPU with Throughput - Hint <#cpu-vs-gpu-with-throughput-hint>`__ - - `Single GPU vs Multiple GPUs <#single-gpu-vs-multiple-gpus>`__ - -- `Basic Application Using GPUs <#basic-application-using-gpus>`__ - - - `Import Necessary Packages <#import-necessary-packages>`__ - - `Compile the Model <#compile-the-model>`__ - - `Load and Preprocess Video - Frames <#load-and-preprocess-video-frames>`__ - - `Define Model Output Classes <#define-model-output-classes>`__ - - `Set up Asynchronous Pipeline <#set-up-asynchronous-pipeline>`__ - - 
- `Callback Definition <#callback-definition>`__ - - `Create Async Pipeline <#create-async-pipeline>`__ - - - `Perform Inference <#perform-inference>`__ - - `Process Results <#process-results>`__ - -- `Conclusion <#conclusion>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -This tutorial provides a high-level overview of working with Intel GPUs -in OpenVINO. It shows how to use Query Device to list system GPUs and -check their properties, and it explains some of the key properties. It -shows how to compile a model on GPU with performance hints and how to -use multiple GPUs using MULTI or CUMULATIVE_THROUGHPUT. - -The tutorial also shows example commands for benchmark_app that can be -run to compare GPU performance in different configurations. It also -provides the code for a basic end-to-end application that compiles a -model on GPU and uses it to run inference. - -Introduction ------------- - - - -Originally, graphic processing units (GPUs) began as specialized chips, -developed to accelerate the rendering of computer graphics. In contrast -to CPUs, which have few but powerful cores, GPUs have many more -specialized cores, making them ideal for workloads that can be -parallelized into simpler tasks. Nowadays, one such workload is deep -learning, where GPUs can easily accelerate inference of neural networks -by splitting operations across multiple cores. - -OpenVINO supports inference on Intel integrated GPUs (which are included -with most `Intel® Core™ desktop and mobile -processors `__) -or on Intel discrete GPU products like the `Intel® Arc™ A-Series -Graphics -cards `__ -and `Intel® Data Center GPU Flex -Series `__. -To get started, first `install -OpenVINO `__ -on a system equipped with one or more Intel GPUs. Follow the `GPU -configuration -instructions `__ -to configure OpenVINO to work with your GPU. Then, read on to learn how -to accelerate inference with GPUs in OpenVINO! - -Install required packages -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %pip install -q "openvino>=2024.4.0" "opencv-python" "tqdm" "huggingface_hub" - -Checking GPUs with Query Device -------------------------------- - - - -In this section, we will see how to list the available GPUs and check -their properties. Some of the key properties will also be defined. - -List GPUs with core.available_devices -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -OpenVINO Runtime provides the ``available_devices`` method for checking -which devices are available for inference. The following code will -output a list of compatible OpenVINO devices, in which Intel GPUs should -appear. - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - core.available_devices - - - - -.. parsed-literal:: - - ['CPU', 'GPU'] - - - -Note that GPU devices are numbered starting at 0, where the integrated -GPU always takes the id ``0`` if the system has one. For instance, if -the system has a CPU, an integrated and discrete GPU, we should expect -to see a list like this: ``['CPU', 'GPU.0', 'GPU.1']``. To simplify its -use, the “GPU.0” can also be addressed with just “GPU”. For more -details, see the `Device Naming -Convention `__ -section. 
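-
-If you are not sure which id maps to which physical GPU, a small sketch
-like the one below can help; it simply combines ``available_devices``
-with the ``FULL_DEVICE_NAME`` property that is described in the next
-section.
-
-.. code:: ipython3
-
-    import openvino as ov
-    import openvino.properties as props
-
-    core = ov.Core()
-    for device_name in core.available_devices:
-        if device_name.startswith("GPU"):
-            # e.g. "GPU.0 -> Intel(R) Graphics [...] (iGPU)"
-            print(device_name, "->", core.get_property(device_name, props.device.full_name))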
- -If the GPUs are installed correctly on the system and still do not -appear in the list, follow the steps described -`here `__ -to configure your GPU drivers to work with OpenVINO. Once we have the -GPUs working with OpenVINO, we can proceed with the next sections. - -Check Properties with core.get_property -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To get information about the GPUs, we can use device properties. In -OpenVINO, devices have properties that describe their characteristics -and configuration. Each property has a name and associated value that -can be queried with the ``get_property`` method. - -To get the value of a property, such as the device name, we can use the -``get_property`` method as follows: - -.. code:: ipython3 - - import openvino.properties as props - - - device = "GPU" - - core.get_property(device, props.device.full_name) - - - - -.. parsed-literal:: - - 'Intel(R) Graphics [0x46a6] (iGPU)' - - - -Each device also has a specific property called -``SUPPORTED_PROPERTIES``, that enables viewing all the available -properties in the device. We can check the value for each property by -simply looping through the dictionary returned by -``core.get_property("GPU", props.supported_properties)`` and then -querying for that property. - -.. code:: ipython3 - - print(f"{device} SUPPORTED_PROPERTIES:\n") - supported_properties = core.get_property(device, props.supported_properties) - indent = len(max(supported_properties, key=len)) - - for property_key in supported_properties: - if property_key not in ( - "SUPPORTED_METRICS", - "SUPPORTED_CONFIG_KEYS", - "SUPPORTED_PROPERTIES", - ): - try: - property_val = core.get_property(device, property_key) - except TypeError: - property_val = "UNSUPPORTED TYPE" - print(f"{property_key:<{indent}}: {property_val}") - - -.. parsed-literal:: - - GPU SUPPORTED_PROPERTIES: - - AVAILABLE_DEVICES : ['0'] - RANGE_FOR_ASYNC_INFER_REQUESTS: (1, 2, 1) - RANGE_FOR_STREAMS : (1, 2) - OPTIMAL_BATCH_SIZE : 1 - MAX_BATCH_SIZE : 1 - CACHING_PROPERTIES : {'GPU_UARCH_VERSION': 'RO', 'GPU_EXECUTION_UNITS_COUNT': 'RO', 'GPU_DRIVER_VERSION': 'RO', 'GPU_DEVICE_ID': 'RO'} - DEVICE_ARCHITECTURE : GPU: v12.0.0 - FULL_DEVICE_NAME : Intel(R) Graphics [0x46a6] (iGPU) - DEVICE_UUID : UNSUPPORTED TYPE - DEVICE_TYPE : Type.INTEGRATED - DEVICE_GOPS : UNSUPPORTED TYPE - OPTIMIZATION_CAPABILITIES : ['FP32', 'BIN', 'FP16', 'INT8'] - GPU_DEVICE_TOTAL_MEM_SIZE : UNSUPPORTED TYPE - GPU_UARCH_VERSION : 12.0.0 - GPU_EXECUTION_UNITS_COUNT : 96 - GPU_MEMORY_STATISTICS : UNSUPPORTED TYPE - PERF_COUNT : False - MODEL_PRIORITY : Priority.MEDIUM - GPU_HOST_TASK_PRIORITY : Priority.MEDIUM - GPU_QUEUE_PRIORITY : Priority.MEDIUM - GPU_QUEUE_THROTTLE : Priority.MEDIUM - GPU_ENABLE_LOOP_UNROLLING : True - CACHE_DIR : - PERFORMANCE_HINT : PerformanceMode.UNDEFINED - COMPILATION_NUM_THREADS : 20 - NUM_STREAMS : 1 - PERFORMANCE_HINT_NUM_REQUESTS : 0 - INFERENCE_PRECISION_HINT : - DEVICE_ID : 0 - - -Brief Descriptions of Key Properties -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Each device has several properties as seen in the last command. Some of -the key properties are: - -- ``FULL_DEVICE_NAME`` - The product name of the GPU and whether it is - an integrated or discrete GPU (iGPU or dGPU). -- ``OPTIMIZATION_CAPABILITIES`` - The model data types (INT8, FP16, - FP32, etc) that are supported by this GPU. -- ``GPU_EXECUTION_UNITS_COUNT`` - The execution cores available in the - GPU’s architecture, which is a relative measure of the GPU’s - processing power. 
-- ``RANGE_FOR_STREAMS`` - The number of processing streams available on - the GPU that can be used to execute parallel inference requests. When - compiling a model in LATENCY or THROUGHPUT mode, OpenVINO will - automatically select the best number of streams for low latency or - high throughput. -- ``PERFORMANCE_HINT`` - A high-level way to tune the device for a - specific performance metric, such as latency or throughput, without - worrying about device-specific settings. -- ``CACHE_DIR`` - The directory where the model cache data is stored to - speed up compilation time. - -To learn more about devices and properties, see the `Query Device -Properties `__ -page. - -Compiling a Model on GPU ------------------------- - - - -Now, we know how to list the GPUs in the system and check their -properties. We can easily use one for compiling and running models with -OpenVINO `GPU -plugin `__. - -Download a Model -~~~~~~~~~~~~~~~~ - - - -This tutorial uses the ``ssdlite_mobilenet_v2`` model. The -``ssdlite_mobilenet_v2`` model is used for object detection. The model -was trained on `Common Objects in Context -(COCO) `__ dataset version with 91 -categories of object. For details, see the -`paper `__. - -.. code:: ipython3 - - import huggingface_hub as hf_hub - from pathlib import Path - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - # A directory where the model will be downloaded. - base_model_dir = Path("./model").expanduser() - - model_name = "ssdlite_mobilenet_v2_fp16" - - ov_model_path = base_model_dir / model_name / f"{model_name}.xml" - - if not (ov_model_path).exists(): - hf_hub.snapshot_download("katuni4ka/ssdlite_mobilenet_v2_fp16", local_dir=base_model_dir / model_name) - - model = core.read_model(ov_model_path) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("gpu-device.ipynb") - -Compile with Default Configuration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -When the model is ready, first we need to read it, using the -``read_model`` method. Then, we can use the ``compile_model`` method and -specify the name of the device we want to compile the model on, in this -case, “GPU”. - -.. code:: ipython3 - - compiled_model = core.compile_model(model, device) - -If you have multiple GPUs in the system, you can specify which one to -use by using “GPU.0”, “GPU.1”, etc. Any of the device names returned by -the ``available_devices`` method are valid device specifiers. You may -also use “AUTO”, which will automatically select the best device for -inference (which is often the GPU). To learn more about AUTO plugin, -visit the `Automatic Device -Selection `__ -page as well as the `AUTO device -tutorial `__. - -Reduce Compile Time through Model Caching -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Depending on the model used, device-specific optimizations and network -compilations can cause the compile step to be time-consuming, especially -with larger models, which may lead to bad user experience in the -application, in which they are used. To solve this, OpenVINO can cache -the model once it is compiled on supported devices and reuse it in later -``compile_model`` calls by simply setting a cache folder beforehand. 
For -instance, to cache the same model we compiled above, we can do the -following: - -.. code:: ipython3 - - import time - from pathlib import Path - - # Create cache folder - cache_folder = Path("cache") - cache_folder.mkdir(exist_ok=True) - - start = time.time() - core = ov.Core() - - # Set cache folder - core.set_property({props.cache_dir(): cache_folder}) - - # Compile the model as before - model = core.read_model(ov_model_path) - compiled_model = core.compile_model(model, device) - print(f"Cache enabled (first time) - compile time: {time.time() - start}s") - - -.. parsed-literal:: - - Cache enabled (first time) - compile time: 1.692436695098877s - - -To get an idea of the effect that caching can have, we can measure the -compile times with caching enabled and disabled as follows: - -.. code:: ipython3 - - start = time.time() - core = ov.Core() - core.set_property({props.cache_dir(): "cache"}) - model = core.read_model(model=ov_model_path) - compiled_model = core.compile_model(model, device) - print(f"Cache enabled - compile time: {time.time() - start}s") - - start = time.time() - core = ov.Core() - model = core.read_model(ov_model_path) - compiled_model = core.compile_model(model, device) - print(f"Cache disabled - compile time: {time.time() - start}s") - - -.. parsed-literal:: - - Cache enabled - compile time: 0.26888394355773926s - Cache disabled - compile time: 1.982884168624878s - - -The actual time improvements will depend on the environment as well as -the model being used but it is definitely something to consider when -optimizing an application. To read more about this, see the `Model -Caching `__ -docs. - -Throughput and Latency Performance Hints -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To simplify device and pipeline configuration, OpenVINO provides -high-level performance hints that automatically set the batch size and -number of parallel threads to use for inference. The “LATENCY” -performance hint optimizes for fast inference times while the -“THROUGHPUT” performance hint optimizes for high overall bandwidth or -FPS. - -To use the “LATENCY” performance hint, add -``{hints.performance_mode(): hints.PerformanceMode.LATENCY}`` when -compiling the model as shown below. For GPUs, this automatically -minimizes the batch size and number of parallel streams such that all of -the compute resources can focus on completing a single inference as fast -as possible. - -.. code:: ipython3 - - import openvino.properties.hint as hints - - - compiled_model = core.compile_model(model, device, {hints.performance_mode(): hints.PerformanceMode.LATENCY}) - -To use the “THROUGHPUT” performance hint, add -``{hints.performance_mode(): hints.PerformanceMode.THROUGHPUT}`` when -compiling the model. For GPUs, this creates multiple processing streams -to efficiently utilize all the execution cores and optimizes the batch -size to fill the available memory. - -.. code:: ipython3 - - compiled_model = core.compile_model(model, device, {hints.performance_mode(): hints.PerformanceMode.THROUGHPUT}) - -Using Multiple GPUs with Multi-Device and Cumulative Throughput -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The latency and throughput hints mentioned above are great and can make -a difference when used adequately but they usually use just one device, -either due to the `AUTO -plugin `__ -or by manual specification of the device name as above. 
When we have -multiple devices, such as an integrated and discrete GPU, we may use -both at the same time to improve the utilization of the resources. In -order to do this, OpenVINO provides a virtual device called -`MULTI `__, -which is just a combination of the existent devices that knows how to -split inference work between them, leveraging the capabilities of each -device. - -As an example, if we want to use both integrated and discrete GPUs and -the CPU at the same time, we can compile the model as follows: - -``compiled_model = core.compile_model(model=model, device_name="MULTI:GPU.1,GPU.0,CPU")`` - -Note that we always need to explicitly specify the device list for MULTI -to work, otherwise MULTI does not know which devices are available for -inference. However, this is not the only way to use multiple devices in -OpenVINO. There is another performance hint called -“CUMULATIVE_THROUGHPUT” that works similar to MULTI, except it uses the -devices automatically selected by AUTO. This way, we do not need to -manually specify devices to use. Below is an example showing how to use -“CUMULATIVE_THROUGHPUT”, equivalent to the MULTI one: - -\` - -compiled_model = core.compile_model(model=model, device_name=“AUTO”, -config={hints.performance_mode(): -hints.PerformanceMode.CUMULATIVE_THROUGHPUT}) \` - - **Important**: **The “THROUGHPUT”, “MULTI”, and - “CUMULATIVE_THROUGHPUT” modes are only applicable to asynchronous - inferencing pipelines. The example at the end of this article shows - how to set up an asynchronous pipeline that takes advantage of - parallelism to increase throughput.** To learn more, see - `Asynchronous - Inferencing `__ - in OpenVINO as well as the `Asynchronous Inference - notebook `__. - -Performance Comparison with benchmark_app ------------------------------------------ - - - -Given all the different options available when compiling a model, it may -be difficult to know which settings work best for a certain application. -Thankfully, OpenVINO provides ``benchmark_app`` - a performance -benchmarking tool. - -The basic syntax of ``benchmark_app`` is as follows: - -``benchmark_app -m PATH_TO_MODEL -d TARGET_DEVICE -hint {throughput,cumulative_throughput,latency,none}`` - -where ``TARGET_DEVICE`` is any device shown by the ``available_devices`` -method as well as the MULTI and AUTO devices we saw previously, and the -value of hint should be one of the values between brackets. - -Note that benchmark_app only requires the model path to run but both the -device and hint arguments will be useful to us. For more advanced -usages, the tool itself has other options that can be checked by running -``benchmark_app -h`` or reading the -`docs `__. -The following example shows how to benchmark a simple model, using a GPU -with a latency focus: - -.. code:: ipython3 - - !benchmark_app -m {ov_model_path} -d GPU -hint latency - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] - [ INFO ] Device info: - [ INFO ] GPU - [ INFO ] Build ................................. 
2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 14.02 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] image_tensor , image_tensor:0 (node: image_tensor) : u8 / [N,H,W,C] / [1,300,300,3] - [ INFO ] Model outputs: - [ INFO ] detection_boxes:0 (node: DetectionOutput) : f32 / [...] / [1,1,100,7] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] image_tensor , image_tensor:0 (node: image_tensor) : u8 / [N,H,W,C] / [1,300,300,3] - [ INFO ] Model outputs: - [ INFO ] detection_boxes:0 (node: DetectionOutput) : f32 / [...] / [1,1,100,7] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 1932.50 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: frozen_inference_graph - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 - [ INFO ] PERF_COUNT: False - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] GPU_HOST_TASK_PRIORITY: Priority.MEDIUM - [ INFO ] GPU_QUEUE_PRIORITY: Priority.MEDIUM - [ INFO ] GPU_QUEUE_THROTTLE: Priority.MEDIUM - [ INFO ] GPU_ENABLE_LOOP_UNROLLING: True - [ INFO ] CACHE_DIR: - [ INFO ] PERFORMANCE_HINT: PerformanceMode.LATENCY - [ INFO ] COMPILATION_NUM_THREADS: 20 - [ INFO ] NUM_STREAMS: 1 - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] DEVICE_ID: 0 - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'image_tensor'!. This input will be filled with random values! - [ INFO ] Fill input 'image_tensor' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 1 inference requests, limits: 60000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 6.17 ms - [Step 11/11] Dumping statistics report - [ INFO ] Count: 12710 iterations - [ INFO ] Duration: 60006.58 ms - [ INFO ] Latency: - [ INFO ] Median: 4.52 ms - [ INFO ] Average: 4.57 ms - [ INFO ] Min: 3.13 ms - [ INFO ] Max: 17.62 ms - [ INFO ] Throughput: 211.81 FPS - - -For completeness, let us list here some of the comparisons we may want -to do by varying the device and hint used. Note that the actual -performance may depend on the hardware used. Generally, we should expect -GPU to be better than CPU, whereas multiple GPUs should be better than a -single GPU as long as there is enough work for each of them. - -CPU vs GPU with Latency Hint -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -.. code:: ipython3 - - !benchmark_app -m {ov_model_path} -d CPU -hint latency - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] - [ INFO ] Device info: - [ INFO ] CPU - [ INFO ] Build ................................. 
2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 30.38 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] image_tensor , image_tensor:0 (node: image_tensor) : u8 / [N,H,W,C] / [1,300,300,3] - [ INFO ] Model outputs: - [ INFO ] detection_boxes:0 (node: DetectionOutput) : f32 / [...] / [1,1,100,7] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] image_tensor , image_tensor:0 (node: image_tensor) : u8 / [N,H,W,C] / [1,300,300,3] - [ INFO ] Model outputs: - [ INFO ] detection_boxes:0 (node: DetectionOutput) : f32 / [...] / [1,1,100,7] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 127.72 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: frozen_inference_graph - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 - [ INFO ] NUM_STREAMS: 1 - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] INFERENCE_NUM_THREADS: 14 - [ INFO ] PERF_COUNT: False - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] PERFORMANCE_HINT: PerformanceMode.LATENCY - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'image_tensor'!. This input will be filled with random values! - [ INFO ] Fill input 'image_tensor' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 1 inference requests, limits: 60000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 4.42 ms - [Step 11/11] Dumping statistics report - [ INFO ] Count: 15304 iterations - [ INFO ] Duration: 60005.72 ms - [ INFO ] Latency: - [ INFO ] Median: 3.87 ms - [ INFO ] Average: 3.88 ms - [ INFO ] Min: 3.49 ms - [ INFO ] Max: 5.95 ms - [ INFO ] Throughput: 255.04 FPS - - -.. code:: ipython3 - - !benchmark_app -m {model_path} -d GPU -hint latency - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] - [ INFO ] Device info: - [ INFO ] GPU - [ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 14.65 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] image_tensor , image_tensor:0 (node: image_tensor) : u8 / [N,H,W,C] / [1,300,300,3] - [ INFO ] Model outputs: - [ INFO ] detection_boxes:0 (node: DetectionOutput) : f32 / [...] / [1,1,100,7] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] image_tensor , image_tensor:0 (node: image_tensor) : u8 / [N,H,W,C] / [1,300,300,3] - [ INFO ] Model outputs: - [ INFO ] detection_boxes:0 (node: DetectionOutput) : f32 / [...] 
/ [1,1,100,7] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 2254.81 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: frozen_inference_graph - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 - [ INFO ] PERF_COUNT: False - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] GPU_HOST_TASK_PRIORITY: Priority.MEDIUM - [ INFO ] GPU_QUEUE_PRIORITY: Priority.MEDIUM - [ INFO ] GPU_QUEUE_THROTTLE: Priority.MEDIUM - [ INFO ] GPU_ENABLE_LOOP_UNROLLING: True - [ INFO ] CACHE_DIR: - [ INFO ] PERFORMANCE_HINT: PerformanceMode.LATENCY - [ INFO ] COMPILATION_NUM_THREADS: 20 - [ INFO ] NUM_STREAMS: 1 - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] DEVICE_ID: 0 - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'image_tensor'!. This input will be filled with random values! - [ INFO ] Fill input 'image_tensor' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 1 inference requests, limits: 60000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 8.79 ms - [Step 11/11] Dumping statistics report - [ INFO ] Count: 11354 iterations - [ INFO ] Duration: 60007.21 ms - [ INFO ] Latency: - [ INFO ] Median: 4.57 ms - [ INFO ] Average: 5.16 ms - [ INFO ] Min: 3.18 ms - [ INFO ] Max: 34.87 ms - [ INFO ] Throughput: 189.21 FPS - - -CPU vs GPU with Throughput Hint -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -.. code:: ipython3 - - !benchmark_app -m {model_path} -d CPU -hint throughput - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] - [ INFO ] Device info: - [ INFO ] CPU - [ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 29.56 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] image_tensor:0 , image_tensor (node: image_tensor) : u8 / [N,H,W,C] / [1,300,300,3] - [ INFO ] Model outputs: - [ INFO ] detection_boxes:0 (node: DetectionOutput) : f32 / [...] / [1,1,100,7] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] image_tensor:0 , image_tensor (node: image_tensor) : u8 / [N,H,W,C] / [1,300,300,3] - [ INFO ] Model outputs: - [ INFO ] detection_boxes:0 (node: DetectionOutput) : f32 / [...] / [1,1,100,7] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 158.91 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: frozen_inference_graph - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 5 - [ INFO ] NUM_STREAMS: 5 - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] INFERENCE_NUM_THREADS: 20 - [ INFO ] PERF_COUNT: False - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'image_tensor'!. 
This input will be filled with random values! - [ INFO ] Fill input 'image_tensor' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 5 inference requests, limits: 60000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 8.15 ms - [Step 11/11] Dumping statistics report - [ INFO ] Count: 25240 iterations - [ INFO ] Duration: 60010.99 ms - [ INFO ] Latency: - [ INFO ] Median: 10.16 ms - [ INFO ] Average: 11.84 ms - [ INFO ] Min: 7.96 ms - [ INFO ] Max: 37.53 ms - [ INFO ] Throughput: 420.59 FPS - - -.. code:: ipython3 - - !benchmark_app -m {model_path} -d GPU -hint throughput - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] - [ INFO ] Device info: - [ INFO ] GPU - [ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 15.45 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] image_tensor , image_tensor:0 (node: image_tensor) : u8 / [N,H,W,C] / [1,300,300,3] - [ INFO ] Model outputs: - [ INFO ] detection_boxes:0 (node: DetectionOutput) : f32 / [...] / [1,1,100,7] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] image_tensor , image_tensor:0 (node: image_tensor) : u8 / [N,H,W,C] / [1,300,300,3] - [ INFO ] Model outputs: - [ INFO ] detection_boxes:0 (node: DetectionOutput) : f32 / [...] / [1,1,100,7] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 2249.04 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: frozen_inference_graph - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 4 - [ INFO ] PERF_COUNT: False - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] GPU_HOST_TASK_PRIORITY: Priority.MEDIUM - [ INFO ] GPU_QUEUE_PRIORITY: Priority.MEDIUM - [ INFO ] GPU_QUEUE_THROTTLE: Priority.MEDIUM - [ INFO ] GPU_ENABLE_LOOP_UNROLLING: True - [ INFO ] CACHE_DIR: - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] COMPILATION_NUM_THREADS: 20 - [ INFO ] NUM_STREAMS: 2 - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] DEVICE_ID: 0 - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'image_tensor'!. This input will be filled with random values! - [ INFO ] Fill input 'image_tensor' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 4 inference requests, limits: 60000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 9.17 ms - [Step 11/11] Dumping statistics report - [ INFO ] Count: 19588 iterations - [ INFO ] Duration: 60023.47 ms - [ INFO ] Latency: - [ INFO ] Median: 11.31 ms - [ INFO ] Average: 12.15 ms - [ INFO ] Min: 9.26 ms - [ INFO ] Max: 36.04 ms - [ INFO ] Throughput: 326.34 FPS - - -Single GPU vs Multiple GPUs -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -.. 
code:: ipython3 - - !benchmark_app -m {model_path} -d GPU.1 -hint throughput - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] - [ INFO ] Device info: - [ INFO ] GPU - [ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Device GPU.1 does not support performance hint property(-hint). - [ ERROR ] Config for device with 1 ID is not registered in GPU plugin - Traceback (most recent call last): - File "/home/adrian/repos/openvino_notebooks/venv/lib/python3.9/site-packages/openvino/tools/benchmark/main.py", line 329, in main - benchmark.set_config(config) - File "/home/adrian/repos/openvino_notebooks/venv/lib/python3.9/site-packages/openvino/tools/benchmark/benchmark.py", line 57, in set_config - self.core.set_property(device, config[device]) - RuntimeError: Config for device with 1 ID is not registered in GPU plugin - - -.. code:: ipython3 - - !benchmark_app -m {model_path} -d AUTO:GPU.1,GPU.0 -hint cumulative_throughput - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] GPU - [ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Device GPU.1 does not support performance hint property(-hint). - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 26.66 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] image_tensor , image_tensor:0 (node: image_tensor) : u8 / [N,H,W,C] / [1,300,300,3] - [ INFO ] Model outputs: - [ INFO ] detection_boxes:0 (node: DetectionOutput) : f32 / [...] / [1,1,100,7] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] image_tensor , image_tensor:0 (node: image_tensor) : u8 / [N,H,W,C] / [1,300,300,3] - [ INFO ] Model outputs: - [ INFO ] detection_boxes:0 (node: DetectionOutput) : f32 / [...] / [1,1,100,7] - [Step 7/11] Loading the model to the device - [ ERROR ] Config for device with 1 ID is not registered in GPU plugin - Traceback (most recent call last): - File "/home/adrian/repos/openvino_notebooks/venv/lib/python3.9/site-packages/openvino/tools/benchmark/main.py", line 414, in main - compiled_model = benchmark.core.compile_model(model, benchmark.device) - File "/home/adrian/repos/openvino_notebooks/venv/lib/python3.9/site-packages/openvino/runtime/ie_api.py", line 399, in compile_model - super().compile_model(model, device_name, {} if config is None else config), - RuntimeError: Config for device with 1 ID is not registered in GPU plugin - - -.. code:: ipython3 - - !benchmark_app -m {model_path} -d MULTI:GPU.1,GPU.0 -hint throughput - - -.. 
parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] - [ INFO ] Device info: - [ INFO ] GPU - [ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] MULTI - [ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Device GPU.1 does not support performance hint property(-hint). - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 14.84 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] image_tensor:0 , image_tensor (node: image_tensor) : u8 / [N,H,W,C] / [1,300,300,3] - [ INFO ] Model outputs: - [ INFO ] detection_boxes:0 (node: DetectionOutput) : f32 / [...] / [1,1,100,7] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] image_tensor:0 , image_tensor (node: image_tensor) : u8 / [N,H,W,C] / [1,300,300,3] - [ INFO ] Model outputs: - [ INFO ] detection_boxes:0 (node: DetectionOutput) : f32 / [...] / [1,1,100,7] - [Step 7/11] Loading the model to the device - [ ERROR ] Config for device with 1 ID is not registered in GPU plugin - Traceback (most recent call last): - File "/home/adrian/repos/openvino_notebooks/venv/lib/python3.9/site-packages/openvino/tools/benchmark/main.py", line 414, in main - compiled_model = benchmark.core.compile_model(model, benchmark.device) - File "/home/adrian/repos/openvino_notebooks/venv/lib/python3.9/site-packages/openvino/runtime/ie_api.py", line 399, in compile_model - super().compile_model(model, device_name, {} if config is None else config), - RuntimeError: Config for device with 1 ID is not registered in GPU plugin - - -Basic Application Using GPUs ----------------------------- - - - -We will now show an end-to-end object detection example using GPUs in -OpenVINO. The application compiles a model on GPU with the “THROUGHPUT” -hint, then loads a video and preprocesses every frame to convert them to -the shape expected by the model. Once the frames are loaded, it sets up -an asynchronous pipeline, performs inference and saves the detections -found in each frame. The detections are then drawn on their -corresponding frame and saved as a video, which is displayed at the end -of the application. - -Import Necessary Packages -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - import time - from pathlib import Path - - import cv2 - import numpy as np - from IPython.display import Video - import openvino as ov - - # Instantiate OpenVINO Runtime - core = ov.Core() - core.available_devices - - - - -.. parsed-literal:: - - ['CPU', 'GPU'] - - - -Compile the Model -~~~~~~~~~~~~~~~~~ - - - -.. 
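code:: ipython3 - - # The next cell relies on names defined earlier in this notebook and not shown in this excerpt. - # As an illustrative assumption only (the import style and the model path are not taken from the - # original cells), they would look roughly like this: - import openvino.properties.hint as hints # provides hints.performance_mode and hints.PerformanceMode - # ov_model_path = "<path to the IR file converted earlier in the notebook>" - - -.. 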
code:: ipython3 - - # Read model and compile it on GPU in THROUGHPUT mode - model = core.read_model(model=ov_model_path) - device_name = "GPU" - compiled_model = core.compile_model(model=model, device_name=device_name, config={hints.performance_mode(): hints.PerformanceMode.THROUGHPUT}) - - # Get the input and output nodes - input_layer = compiled_model.input(0) - output_layer = compiled_model.output(0) - - # Get the input size - num, height, width, channels = input_layer.shape - print("Model input shape:", num, height, width, channels) - - -.. parsed-literal:: - - Model input shape: 1 300 300 3 - - -Load and Preprocess Video Frames -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Load video - video_file = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Coco%20Walking%20in%20Berkeley.mp4" - video = cv2.VideoCapture(video_file) - framebuf = [] - - # Go through every frame of video and resize it - print("Loading video...") - while video.isOpened(): - ret, frame = video.read() - if not ret: - print("Video loaded!") - video.release() - break - - # Preprocess frames - convert them to shape expected by model - input_frame = cv2.resize(src=frame, dsize=(width, height), interpolation=cv2.INTER_AREA) - input_frame = np.expand_dims(input_frame, axis=0) - - # Append frame to framebuffer - framebuf.append(input_frame) - - - print("Frame shape: ", framebuf[0].shape) - print("Number of frames: ", len(framebuf)) - - # Show original video file - # If the video does not display correctly inside the notebook, please open it with your favorite media player - Video(video_file) - - -.. parsed-literal:: - - Loading video... - Video loaded! - Frame shape: (1, 300, 300, 3) - Number of frames: 288 - - -Define Model Output Classes -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Define the model's labelmap (this model uses COCO classes) - classes = [ - "background", - "person", - "bicycle", - "car", - "motorcycle", - "airplane", - "bus", - "train", - "truck", - "boat", - "traffic light", - "fire hydrant", - "street sign", - "stop sign", - "parking meter", - "bench", - "bird", - "cat", - "dog", - "horse", - "sheep", - "cow", - "elephant", - "bear", - "zebra", - "giraffe", - "hat", - "backpack", - "umbrella", - "shoe", - "eye glasses", - "handbag", - "tie", - "suitcase", - "frisbee", - "skis", - "snowboard", - "sports ball", - "kite", - "baseball bat", - "baseball glove", - "skateboard", - "surfboard", - "tennis racket", - "bottle", - "plate", - "wine glass", - "cup", - "fork", - "knife", - "spoon", - "bowl", - "banana", - "apple", - "sandwich", - "orange", - "broccoli", - "carrot", - "hot dog", - "pizza", - "donut", - "cake", - "chair", - "couch", - "potted plant", - "bed", - "mirror", - "dining table", - "window", - "desk", - "toilet", - "door", - "tv", - "laptop", - "mouse", - "remote", - "keyboard", - "cell phone", - "microwave", - "oven", - "toaster", - "sink", - "refrigerator", - "blender", - "book", - "clock", - "vase", - "scissors", - "teddy bear", - "hair drier", - "toothbrush", - "hair brush", - ] - -Set up Asynchronous Pipeline -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Callback Definition -^^^^^^^^^^^^^^^^^^^ - - - -.. 
code:: ipython3 - - # Define a callback function that runs every time the asynchronous pipeline completes inference on a frame - def completion_callback(infer_request: ov.InferRequest, frame_id: int) -> None: - global frame_number - stop_time = time.time() - frame_number += 1 - - predictions = next(iter(infer_request.results.values())) - results[frame_id] = predictions[:10] # Grab first 10 predictions for this frame - - total_time = stop_time - start_time - frame_fps[frame_id] = frame_number / total_time - -Create Async Pipeline -^^^^^^^^^^^^^^^^^^^^^ - - - -.. code:: ipython3 - - # Create asynchronous inference queue with optimal number of infer requests - infer_queue = ov.AsyncInferQueue(compiled_model) - infer_queue.set_callback(completion_callback) - -Perform Inference -~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Perform inference on every frame in the framebuffer - results = {} - frame_fps = {} - frame_number = 0 - start_time = time.time() - for i, input_frame in enumerate(framebuf): - infer_queue.start_async({0: input_frame}, i) - - infer_queue.wait_all() # Wait until all inference requests in the AsyncInferQueue are completed - stop_time = time.time() - - # Calculate total inference time and FPS - total_time = stop_time - start_time - fps = len(framebuf) / total_time - time_per_frame = 1 / fps - print(f"Total time to infer all frames: {total_time:.3f}s") - print(f"Time per frame: {time_per_frame:.6f}s ({fps:.3f} FPS)") - - -.. parsed-literal:: - - Total time to infer all frames: 1.366s - Time per frame: 0.004744s (210.774 FPS) - - -Process Results -~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Set minimum detection threshold - min_thresh = 0.6 - - # Load video - video = cv2.VideoCapture(video_file) - - # Get video parameters - frame_width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) - frame_height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) - fps = int(video.get(cv2.CAP_PROP_FPS)) - fourcc = int(video.get(cv2.CAP_PROP_FOURCC)) - - # Create folder and VideoWriter to save output video - Path("./output").mkdir(exist_ok=True) - output = cv2.VideoWriter("output/output.mp4", fourcc, fps, (frame_width, frame_height)) - - # Draw detection results on every frame of video and save as a new video file - while video.isOpened(): - current_frame = int(video.get(cv2.CAP_PROP_POS_FRAMES)) - ret, frame = video.read() - if not ret: - print("Video loaded!") - output.release() - video.release() - break - - # Draw info at the top left such as current fps, the devices and the performance hint being used - cv2.putText( - frame, - f"fps {str(round(frame_fps[current_frame], 2))}", - (5, 20), - cv2.FONT_ITALIC, - 0.6, - (0, 0, 0), - 1, - cv2.LINE_AA, - ) - cv2.putText( - frame, - f"device {device_name}", - (5, 40), - cv2.FONT_ITALIC, - 0.6, - (0, 0, 0), - 1, - cv2.LINE_AA, - ) - cv2.putText( - frame, - f"hint {compiled_model.get_property(hints.performance_mode)}", - (5, 60), - cv2.FONT_ITALIC, - 0.6, - (0, 0, 0), - 1, - cv2.LINE_AA, - ) - - # prediction contains [image_id, label, conf, x_min, y_min, x_max, y_max] according to model - for prediction in np.squeeze(results[current_frame]): - if prediction[2] > min_thresh: - x_min = int(prediction[3] * frame_width) - y_min = int(prediction[4] * frame_height) - x_max = int(prediction[5] * frame_width) - y_max = int(prediction[6] * frame_height) - label = classes[int(prediction[1])] - - # Draw a bounding box with its label above it - cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 1, cv2.LINE_AA) - cv2.putText( - frame, - label, - (x_min, 
y_min - 10), - cv2.FONT_ITALIC, - 1, - (255, 0, 0), - 1, - cv2.LINE_AA, - ) - - output.write(frame) - - # Show output video file - # If the video does not display correctly inside the notebook, please open it with your favorite media player - Video("output/output.mp4", width=800, embed=True) - - -.. parsed-literal:: - - Video loaded! - - - - -.. raw:: html - - - - - -Conclusion ----------- - - - -This tutorial demonstrates how easy it is to use one or more GPUs in -OpenVINO, check their properties, and even tailor the model performance -through the different performance hints. It also provides a walk-through -of a basic object detection application that uses a GPU and displays the -detected bounding boxes. - -To read more about any of these topics, feel free to visit their -corresponding documentation: - -- `GPU - Plugin `__ -- `AUTO - Plugin `__ -- `Model - Caching `__ -- `MULTI Device - Mode `__ -- `Query Device - Properties `__ -- `Configurations for GPUs with - OpenVINO `__ -- `Benchmark Python - Tool `__ -- `Asynchronous - Inferencing `__ diff --git a/docs/notebooks/grammar-correction-with-output.rst b/docs/notebooks/grammar-correction-with-output.rst deleted file mode 100644 index a70244164b6e7e..00000000000000 --- a/docs/notebooks/grammar-correction-with-output.rst +++ /dev/null @@ -1,826 +0,0 @@ -Grammatical Error Correction with OpenVINO -========================================== - -AI-based auto-correction products are becoming increasingly popular due -to their ease of use, editing speed, and affordability. These products -improve the quality of written text in emails, blogs, and chats. - -Grammatical Error Correction (GEC) is the task of correcting different -types of errors in text such as spelling, punctuation, grammatical and -word choice errors. GEC is typically formulated as a sentence correction -task. A GEC system takes a potentially erroneous sentence as input and -is expected to transform it into a more correct version. See the example -given below: - -=========================== ========================== -Input (Erroneous) Output (Corrected) -=========================== ========================== -I like to rides my bicycle. I like to ride my bicycle. -=========================== ========================== - -As shown in the image below, different types of errors in written -language can be corrected. - -.. figure:: https://cdn-images-1.medium.com/max/540/1*Voez5hEn5MU8Knde3fIZfw.png - :alt: error_types - - error_types - -This tutorial shows how to perform grammatical error correction using -OpenVINO. We will use pre-trained models from the `Hugging Face -Transformers `__ -library. To simplify the user experience, the `Hugging Face -Optimum `__ library is used to -convert the models to OpenVINO™ IR format. - -It consists of the following steps: - -- Install prerequisites -- Download and convert models from a public source using the `OpenVINO - integration with Hugging Face - Optimum `__. -- Create an inference pipeline for grammatical error checking -- Optimize grammar correction pipeline with - `NNCF `__ quantization -- Compare original and optimized pipelines from performance and - accuracy standpoints - - -**Table of contents:** - - -- `How does it work? 
<#how-does-it-work>`__ -- `Prerequisites <#prerequisites>`__ -- `Download and Convert Models <#download-and-convert-models>`__ - - - `Select inference device <#select-inference-device>`__ - - `Grammar Checker <#grammar-checker>`__ - - `Grammar Corrector <#grammar-corrector>`__ - -- `Prepare Demo Pipeline <#prepare-demo-pipeline>`__ -- `Quantization <#quantization>`__ - - - `Run Quantization <#run-quantization>`__ - - `Compare model size, performance and - accuracy <#compare-model-size-performance-and-accuracy>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -How does it work? ------------------ - - - -A Grammatical Error Correction task can be thought of as a -sequence-to-sequence task where a model is trained to take a -grammatically incorrect sentence as input and return a grammatically -correct sentence as output. We will use the -`FLAN-T5 `__ -model finetuned on an expanded version of the -`JFLEG `__ dataset. - -The version of FLAN-T5 released with the `Scaling Instruction-Finetuned -Language Models `__ paper is an -enhanced version of `T5 `__ that has -been finetuned on a combination of tasks. The paper explores instruction -finetuning with a particular focus on scaling the number of tasks, -scaling the model size, and finetuning on chain-of-thought data. The -paper discovers that overall instruction finetuning is a general method -that improves the performance and usability of pre-trained language -models. - -.. figure:: https://production-media.paperswithcode.com/methods/a04cb14e-e6b8-449e-9487-bc4262911d74.png - :alt: flan-t5_training - - flan-t5_training - -For more details about the model, please check out -`paper `__, original -`repository `__, and Hugging -Face `model card `__ - -Additionally, to reduce the number of sentences required to be -processed, you can perform grammatical correctness checking. This task -should be considered as a simple binary text classification, where the -model gets input text and predicts label 1 if a text contains any -grammatical errors and 0 if it does not. You will use the -`roberta-base-CoLA `__ -model, the RoBERTa Base model finetuned on the CoLA dataset. The RoBERTa -model was proposed in `RoBERTa: A Robustly Optimized BERT Pretraining -Approach paper `__. It builds on BERT -and modifies key hyperparameters, removing the next-sentence -pre-training objective and training with much larger mini-batches and -learning rates. Additional details about the model can be found in a -`blog -post `__ -by Meta AI and in the `Hugging Face -documentation `__ - -Now that we know more about FLAN-T5 and RoBERTa, let us get started. 🚀 - -Prerequisites -------------- - - - -First, we need to install the `Hugging Face -Optimum `__ library -accelerated by OpenVINO integration. The Hugging Face Optimum API is a -high-level API that enables us to convert and quantize models from the -Hugging Face Transformers library to the OpenVINO™ IR format. For more -details, refer to the `Hugging Face Optimum -documentation `__. - -.. 
code:: ipython3 - - import platform - - %pip install -q "torch>=2.1.0" "git+https://github.com/huggingface/optimum-intel.git" "openvino>=2024.0.0" "onnx<1.16.2" tqdm "gradio>=4.19" "transformers>=4.33.0" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "nncf>=2.9.0" datasets jiwer - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - -Download and Convert Models ---------------------------- - - - -Optimum Intel can be used to load optimized models from the `Hugging -Face Hub `__ and -create pipelines to run an inference with OpenVINO Runtime using Hugging -Face APIs. The Optimum Inference models are API compatible with Hugging -Face Transformers models. This means we just need to replace -``AutoModelForXxx`` class with the corresponding ``OVModelForXxx`` -class. - -Below is an example of the RoBERTa text classification model - -.. code:: diff - - -from transformers import AutoModelForSequenceClassification - +from optimum.intel.openvino import OVModelForSequenceClassification - from transformers import AutoTokenizer, pipeline - - model_id = "textattack/roberta-base-CoLA" - -model = AutoModelForSequenceClassification.from_pretrained(model_id) - +model = OVModelForSequenceClassification.from_pretrained(model_id, from_transformers=True) - -Model class initialization starts with calling ``from_pretrained`` -method. When downloading and converting Transformers model, the -parameter ``from_transformers=True`` should be added. We can save the -converted model for the next usage with the ``save_pretrained`` method. -Tokenizer class and pipelines API are compatible with Optimum models. - -.. code:: ipython3 - - from transformers import pipeline, AutoTokenizer - from optimum.intel.openvino import OVModelForSeq2SeqLM, OVModelForSequenceClassification - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("grammar-correction.ipynb") - - -.. parsed-literal:: - - 2024-03-25 11:56:04.043628: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-03-25 11:56:04.045940: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used. - 2024-03-25 11:56:04.079112: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - 2024-03-25 11:56:04.079147: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - 2024-03-25 11:56:04.079167: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2024-03-25 11:56:04.085243: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used. 
- 2024-03-25 11:56:04.085971: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-03-25 11:56:05.314633: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(default="CPU", exclude=["AUTO", "GPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO') - - - -Grammar Checker -~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - grammar_checker_model_id = "textattack/roberta-base-CoLA" - grammar_checker_dir = Path("roberta-base-cola") - grammar_checker_tokenizer = AutoTokenizer.from_pretrained(grammar_checker_model_id) - - if grammar_checker_dir.exists(): - grammar_checker_model = OVModelForSequenceClassification.from_pretrained(grammar_checker_dir, device=device.value) - else: - grammar_checker_model = OVModelForSequenceClassification.from_pretrained(grammar_checker_model_id, export=True, device=device.value, load_in_8bit=False) - grammar_checker_model.save_pretrained(grammar_checker_dir) - - -.. parsed-literal:: - - Framework not specified. Using pt to export the model. - Some weights of the model checkpoint at textattack/roberta-base-CoLA were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight'] - - This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - - This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). - Using the export variant default. Available variants are: - - default: The default ONNX variant. - Using framework PyTorch: 2.2.1+cpu - Overriding 1 configuration item(s) - - use_cache -> False - /home/ea/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py:4225: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead - warnings.warn( - Compiling the model to AUTO ... - - -Let us check model work, using inference pipeline for -``text-classification`` task. You can find more information about usage -Hugging Face inference pipelines in this -`tutorial `__ - -.. 
code:: ipython3 - - import torch - - input_text = "They are moved by salar energy" - grammar_checker_pipe = pipeline("text-classification", model=grammar_checker_model, tokenizer=grammar_checker_tokenizer, device=torch.device("cpu")) - result = grammar_checker_pipe(input_text)[0] - print(f"input text: {input_text}") - print(f'predicted label: {"contains_errors" if result["label"] == "LABEL_1" else "no errors"}') - print(f'predicted score: {result["score"] :.2}') - - -.. parsed-literal:: - - input text: They are moved by salar energy - predicted label: contains_errors - predicted score: 0.88 - - -Great! Looks like the model can detect errors in the sample. - -Grammar Corrector -~~~~~~~~~~~~~~~~~ - - - -The steps for loading the Grammar Corrector model are very similar, -except for the model class that is used. Because FLAN-T5 is a -sequence-to-sequence text generation model, we should use the -``OVModelForSeq2SeqLM`` class and the ``text2text-generation`` pipeline -to run it. - -.. code:: ipython3 - - grammar_corrector_model_id = "pszemraj/flan-t5-large-grammar-synthesis" - grammar_corrector_dir = Path("flan-t5-large-grammar-synthesis") - grammar_corrector_tokenizer = AutoTokenizer.from_pretrained(grammar_corrector_model_id) - - if grammar_corrector_dir.exists(): - grammar_corrector_model = OVModelForSeq2SeqLM.from_pretrained(grammar_corrector_dir, device=device.value) - else: - grammar_corrector_model = OVModelForSeq2SeqLM.from_pretrained(grammar_corrector_model_id, export=True, device=device.value) - grammar_corrector_model.save_pretrained(grammar_corrector_dir) - - -.. parsed-literal:: - - Framework not specified. Using pt to export the model. - Using the export variant default. Available variants are: - - default: The default ONNX variant. - Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41. - Non-default generation parameters: {'max_length': 512, 'min_length': 8, 'num_beams': 2, 'no_repeat_ngram_size': 4} - Using framework PyTorch: 2.2.1+cpu - Overriding 1 configuration item(s) - - use_cache -> False - /home/ea/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py:4225: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead - warnings.warn( - Using framework PyTorch: 2.2.1+cpu - Overriding 1 configuration item(s) - - use_cache -> True - /home/ea/miniconda3/lib/python3.11/site-packages/transformers/modeling_utils.py:943: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if causal_mask.shape[1] < attention_mask.shape[1]: - Using framework PyTorch: 2.2.1+cpu - Overriding 1 configuration item(s) - - use_cache -> True - /home/ea/miniconda3/lib/python3.11/site-packages/transformers/models/t5/modeling_t5.py:509: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
- elif past_key_value.shape[2] != key_value_states.shape[1]: - Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41. - Non-default generation parameters: {'max_length': 512, 'min_length': 8, 'num_beams': 2, 'no_repeat_ngram_size': 4} - Compiling the encoder to AUTO ... - Compiling the decoder to AUTO ... - Compiling the decoder to AUTO ... - Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41. - Non-default generation parameters: {'max_length': 512, 'min_length': 8, 'num_beams': 2, 'no_repeat_ngram_size': 4} - - -.. code:: ipython3 - - grammar_corrector_pipe = pipeline("text2text-generation", model=grammar_corrector_model, tokenizer=grammar_corrector_tokenizer, device=torch.device("cpu")) - -.. code:: ipython3 - - result = grammar_corrector_pipe(input_text)[0] - print(f"input text: {input_text}") - print(f'generated text: {result["generated_text"]}') - - -.. parsed-literal:: - - input text: They are moved by salar energy - generated text: They are powered by solar energy. - - -Nice! The result looks pretty good! - -Prepare Demo Pipeline ---------------------- - - - -Now let us put everything together and create the pipeline for grammar -correction. The pipeline accepts input text, verifies its correctness, -and generates the correct version if required. It will consist of -several steps: - -1. Split text on sentences. -2. Check grammatical correctness for each sentence using Grammar - Checker. -3. Generate an improved version of the sentence if required. - -.. code:: ipython3 - - import re - import transformers - from tqdm.notebook import tqdm - - - def split_text(text: str) -> list: - """ - Split a string of text into a list of sentence batches. - - Parameters: - text (str): The text to be split into sentence batches. - - Returns: - list: A list of sentence batches. Each sentence batch is a list of sentences. - """ - # Split the text into sentences using regex - sentences = re.split(r"(?<=[^A-Z].[.?]) +(?=[A-Z])", text) - - # Initialize a list to store the sentence batches - sentence_batches = [] - - # Initialize a temporary list to store the current batch of sentences - temp_batch = [] - - # Iterate through the sentences - for sentence in sentences: - # Add the sentence to the temporary batch - temp_batch.append(sentence) - - # If the length of the temporary batch is between 2 and 3 sentences, or if it is the last batch, add it to the list of sentence batches - if len(temp_batch) >= 2 and len(temp_batch) <= 3 or sentence == sentences[-1]: - sentence_batches.append(temp_batch) - temp_batch = [] - - return sentence_batches - - - def correct_text( - text: str, - checker: transformers.pipelines.Pipeline, - corrector: transformers.pipelines.Pipeline, - separator: str = " ", - ) -> str: - """ - Correct the grammar in a string of text using a text-classification and text-generation pipeline. - - Parameters: - text (str): The inpur text to be corrected. - checker (transformers.pipelines.Pipeline): The text-classification pipeline to use for checking the grammar quality of the text. 
- corrector (transformers.pipelines.Pipeline): The text-generation pipeline to use for correcting the text. - separator (str, optional): The separator to use when joining the corrected text into a single string. Default is a space character. - - Returns: - str: The corrected text. - """ - # Split the text into sentence batches - sentence_batches = split_text(text) - - # Initialize a list to store the corrected text - corrected_text = [] - - # Iterate through the sentence batches - for batch in tqdm(sentence_batches, total=len(sentence_batches), desc="correcting text.."): - # Join the sentences in the batch into a single string - raw_text = " ".join(batch) - - # Check the grammar quality of the text using the text-classification pipeline - results = checker(raw_text) - - # Only correct the text if the results of the text-classification are not LABEL_1 or are LABEL_1 with a score below 0.9 - if results[0]["label"] != "LABEL_1" or (results[0]["label"] == "LABEL_1" and results[0]["score"] < 0.9): - # Correct the text using the text-generation pipeline - corrected_batch = corrector(raw_text) - corrected_text.append(corrected_batch[0]["generated_text"]) - else: - corrected_text.append(raw_text) - - # Join the corrected text into a single string - corrected_text = separator.join(corrected_text) - - return corrected_text - -Let us see it in action. - -.. code:: ipython3 - - default_text = ( - "Most of the course is about semantic or content of language but there are also interesting" - " topics to be learned from the servicefeatures except statistics in characters in documents.At" - " this point, He introduces herself as his native English speaker and goes on to say that if" - " you contine to work on social scnce" - ) - - corrected_text = correct_text(default_text, grammar_checker_pipe, grammar_corrector_pipe) - - - -.. parsed-literal:: - - correcting text..: 0%| | 0/1 [00:00`__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -Grammar checker model takes up a tiny portion of the whole text -correction pipeline so we optimize only the grammar corrector model. -Grammar corrector itself consists of three models: encoder, first call -decoder and decoder with past. The last model’s share of inference -dominates the other ones. Because of this we quantize only it. - -The optimization process contains the following steps: - -1. Create a calibration dataset for quantization. -2. Run ``nncf.quantize()`` to obtain quantized models. -3. Serialize the ``INT8`` model using ``openvino.save_model()`` - function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Run Quantization -~~~~~~~~~~~~~~~~ - - - -Below we retrieve the quantized model. Please see ``utils.py`` for -source code. Quantization is relatively time-consuming and will take -some time to complete. - -.. 
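code:: ipython3 - - # A schematic sketch only: the real calibration and quantization code for this notebook lives in - # utils.py (get_quantized_pipeline). Conceptually, the decoder-with-past model is quantized with the - # NNCF post-training API roughly as follows (variable names here are illustrative assumptions): - # - # import nncf - # calibration_data = nncf.Dataset(collected_decoder_inputs) # inputs captured while running samples - # quantized_decoder = nncf.quantize(decoder_with_past_model, calibration_data) - # ov.save_model(quantized_decoder, quantized_model_path) - - -.. 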
code:: ipython3 - - from utils import get_quantized_pipeline, CALIBRATION_DATASET_SIZE - import openvino as ov - - grammar_corrector_pipe_fp32 = grammar_corrector_pipe - grammar_corrector_pipe_int8 = None - if to_quantize.value: - quantized_model_path = Path("quantized_decodet") / "openvino_model.xml" - grammar_corrector_pipe_int8 = get_quantized_pipeline( - grammar_corrector_pipe_fp32, - grammar_corrector_tokenizer, - ov.Core(), - grammar_corrector_dir, - quantized_model_path, - device.value, - calibration_dataset_size=CALIBRATION_DATASET_SIZE, - ) - - - -.. parsed-literal:: - - Downloading readme: 0%| | 0.00/5.94k [00:00`__\ -dataset is used for testing. One dataset sample consists of a text with -errors as input and several corrected versions as labels. When measuring -accuracy we use mean ``(1 - WER)`` against corrected text versions, -where WER is Word Error Rate metric. - -.. code:: ipython3 - - from utils import calculate_inference_time_and_accuracy - - TEST_SUBSET_SIZE = 50 - - if to_quantize.value: - inference_time_fp32, accuracy_fp32 = calculate_inference_time_and_accuracy(grammar_corrector_pipe_fp32, TEST_SUBSET_SIZE) - print(f"Evaluation results of FP32 grammar correction pipeline. Accuracy: {accuracy_fp32:.2f}%. Time: {inference_time_fp32:.2f} sec.") - inference_time_int8, accuracy_int8 = calculate_inference_time_and_accuracy(grammar_corrector_pipe_int8, TEST_SUBSET_SIZE) - print(f"Evaluation results of INT8 grammar correction pipeline. Accuracy: {accuracy_int8:.2f}%. Time: {inference_time_int8:.2f} sec.") - print(f"Performance speedup: {inference_time_fp32 / inference_time_int8:.3f}") - print(f"Accuracy drop :{accuracy_fp32 - accuracy_int8:.2f}%.") - print(f"Model footprint reduction: {model_size_fp32 / model_size_int8:.3f}") - - - -.. parsed-literal:: - - Evaluation: 0%| | 0/50 [00:00`__ -on Intel® platforms. - -GroundedSAM aims to detect and segment anything with text inputs. -GroundingDINO is a language-guided query selection module to enhance -object detection using input text. It selects relevant features from -image and text inputs and returns predicted boxes with detections. The -Segment Anything Model (SAM) produces high quality object masks from -input prompts such as points or boxes, and it can be used to generate -masks for all objects in an image. We use box predictions from -GroundingDINO to mask the original image. - -More details about the model can be found in the -`paper `__, and the official -`repository `__. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/5703039/3c19063a-c60a-4d5d-b534-e1305a854180 - :alt: image - - image - - -**Table of contents:** - - -- `Clone repository and install - requirements <#clone-repository-and-install-requirements>`__ -- `Download checkpoints and load PyTorch - model <#download-checkpoints-and-load-pytorch-model>`__ -- `Convert GroundingDINO to OpenVINO IR - format <#convert-groundingdino-to-openvino-ir-format>`__ -- `Run OpenVINO optimized - GroundingDINO <#run-openvino-optimized-groundingdino>`__ -- `Convert SAM to OpenVINO IR <#convert-sam-to-openvino-ir>`__ -- `Combine GroundingDINO + SAM - (GroundedSAM) <#combine-groundingdino--sam-groundedsam>`__ -- `Interactive GroundedSAM <#interactive-groundedsam>`__ -- `Cleanup <#cleanup>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. 
For details, please refer to -`Installation -Guide `__. - -Clone repositories and install requirements -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %pip install -q "openvino>=2024.0" "torch>=2.1" opencv-python "Pillow>=10.0" "supervision[desktop]>=0.22" transformers yapf pycocotools addict "gradio>=4.19" tqdm timm --extra-index-url https://download.pytorch.org/whl/cpu - - -.. parsed-literal:: - - WARNING: supervision 0.25.1 does not provide the extra 'desktop' - Note: you may need to restart the kernel to use updated packages. - - -For faster computation and to limit RAM by default we use -``EfficientSAM`` for segmentation, but if you wish more accurate -segmentation you can select vanilla ``SAM``. - -.. code:: ipython3 - - import ipywidgets as widgets - - sam_type_widget = widgets.Dropdown( - options=["EfficientSAM", "SAM"], - value="EfficientSAM", - description="Segment Anything type:", - ) - sam_type_widget - - - - -.. parsed-literal:: - - Dropdown(description='Segment Anything type:', options=('EfficientSAM', 'SAM'), value='EfficientSAM') - - - -.. code:: ipython3 - - use_efficient_sam = sam_type_widget.value == "EfficientSAM" - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("grounded-segment-anything.ipynb") - -.. code:: ipython3 - - from pathlib import Path - import sys - import os - - from cmd_helper import clone_repo - - - repo_dir = Path("Grounded-Segment-Anything") - ground_dino_dir = Path("GroundingDINO") - efficient_sam_dir = Path("EfficientSAM") - - # we use grounding dino from a fork which contains modifications that allow conversion to OpenVINO IR - clone_repo("https://github.com/wenyi5608/GroundingDINO.git") - - if use_efficient_sam: - clone_repo("https://github.com/yformer/EfficientSAM.git") - if not use_efficient_sam: - clone_repo("https://github.com/IDEA-Research/Grounded-Segment-Anything.git", add_to_sys_path=False) - sys.path.append(repo_dir / "segment_anything") - -.. code:: ipython3 - - import torch - import numpy as np - import supervision as sv - import openvino as ov - from PIL import Image, ImageDraw, ImageFont - from typing import Union, List - import transformers - - core = ov.Core() - -Download checkpoints and load PyTorch models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - IRS_PATH = Path("openvino_irs") - CKPT_BASE_PATH = Path("checkpoints") - os.makedirs(IRS_PATH, exist_ok=True) - os.makedirs(CKPT_BASE_PATH, exist_ok=True) - - PT_DEVICE = "cpu" - ov_dino_name = "openvino_grounding_dino" - ov_sam_name = "openvino_segment_anything" - - ground_dino_img_size = (1024, 1280) - - # GroundingDINO config and checkpoint - GROUNDING_DINO_CONFIG_PATH = f"{ground_dino_dir}/groundingdino/config/GroundingDINO_SwinT_OGC.py" - GROUNDING_DINO_CHECKPOINT_PATH = CKPT_BASE_PATH / "groundingdino_swint_ogc.pth" - - # Segment Anything checkpoint - SAM_CHECKPOINT_PATH = CKPT_BASE_PATH / "sam_vit_h_4b8939.pth" - - # Efficient Segment Anything checkpoint - EFFICIENT_SAM_CHECKPOINT_PATH = efficient_sam_dir / "weights/efficient_sam_vitt.pt" - -.. code:: ipython3 - - from notebook_utils import download_file, device_widget - - - if not (CKPT_BASE_PATH / "groundingdino_swint_ogc.pth").exists(): - download_file( - "https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth", - directory=CKPT_BASE_PATH, - ) - if not use_efficient_sam and not (CKPT_BASE_PATH / "sam_vit_h_4b8939.pth").exists(): - download_file( - "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth", - directory=CKPT_BASE_PATH, - ) - - - -.. parsed-literal:: - - groundingdino_swint_ogc.pth: 0%| | 0.00/662M [00:00 torch.Tensor: - import groundingdino.datasets.transforms as T - - transform = T.Compose( - [ - T.RandomResize([800], max_size=1333), - T.ToTensor(), - T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), - ] - ) - image, _ = transform(pil_image, None) # 3, h, w - return image - - - # detects boxes usding openvino optimized grounding dino model - def get_ov_grounding_output( - model: ov.CompiledModel, - pil_image: Image.Image, - caption: Union[str, List[str]], - box_threshold: float, - text_threshold: float, - dino_tokenizer: transformers.PreTrainedTokenizerBase = dino_tokenizer, - max_text_len: int = max_text_len, - ): - # for text prompt pre-processing we reuse existing routines from GroundignDINO repo - if isinstance(caption, list): - caption = ". ".join(caption) - caption = caption.lower() - caption = caption.strip() - if not caption.endswith("."): - caption = caption + "." - captions = [caption] - - tokenized = dino_tokenizer(captions, padding="longest", return_tensors="pt") - specical_tokens = dino_tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) - - ( - text_self_attention_masks, - position_ids, - cate_to_token_mask_list, - ) = generate_masks_with_special_tokens_and_transfer_map(tokenized, specical_tokens, dino_tokenizer) - - if text_self_attention_masks.shape[1] > max_text_len: - text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] - - position_ids = position_ids[:, :max_text_len] - tokenized["input_ids"] = tokenized["input_ids"][:, :max_text_len] - tokenized["attention_mask"] = tokenized["attention_mask"][:, :max_text_len] - tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len] - - # inputs dictionary which will be fed into the ov.CompiledModel for inference - inputs = {} - inputs["attention_mask.1"] = tokenized["attention_mask"] - inputs["text_self_attention_masks"] = text_self_attention_masks - inputs["input_ids"] = tokenized["input_ids"] - inputs["position_ids"] = position_ids - inputs["token_type_ids"] = tokenized["token_type_ids"] - - # GroundingDINO fails to run with input shapes different than one used for conversion. 
- # As a workaround we resize input_image to the size used for conversion. Model does not rely - # on image resolution to know object sizes therefore no need to resize box_predictions - from torchvision.transforms.functional import resize, InterpolationMode - - input_img = resize( - transform_image(pil_image), - ground_dino_img_size, - interpolation=InterpolationMode.BICUBIC, - )[None, ...] - inputs["samples"] = input_img - - # OpenVINO inference - request = model.create_infer_request() - request.start_async(inputs, share_inputs=False) - request.wait() - - def sig(x): - return 1 / (1 + np.exp(-x)) - - logits = torch.from_numpy(sig(np.squeeze(request.get_tensor("pred_logits").data, 0))) - boxes = torch.from_numpy(np.squeeze(request.get_tensor("pred_boxes").data, 0)) - - # filter output - filt_mask = logits.max(dim=1)[0] > box_threshold - logits, boxes = logits[filt_mask], boxes[filt_mask] - - # get phrase and build predictions - tokenized = dino_tokenizer(caption) - pred_phrases = [] - for logit in logits: - pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, dino_tokenizer) - pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})") - - return boxes, pred_phrases, logits.max(dim=1)[0] - -.. code:: ipython3 - - SOURCE_IMAGE_PATH = f"{ground_dino_dir}/.asset/demo7.jpg" - BOX_THRESHOLD = 0.3 - TEXT_THRESHOLD = 0.25 - NMS_THRESHOLD = 0.8 - - pil_image = Image.open(SOURCE_IMAGE_PATH) - classes_prompt = ["Horse", "Cloud"] - -.. code:: ipython3 - - boxes_filt, pred_phrases, logits_filt = get_ov_grounding_output(ov_compiled_grounded_dino, pil_image, classes_prompt, BOX_THRESHOLD, TEXT_THRESHOLD) - -Convert predicted boxes to supervision box detections format - -.. code:: ipython3 - - source_w, source_h = pil_image.size - detections = Model.post_process_result(source_h=source_h, source_w=source_w, boxes=boxes_filt, logits=logits_filt) - - class_id = Model.phrases2classes(phrases=pred_phrases, classes=list(map(str.lower, classes_prompt))) - detections.class_id = class_id - -Draw box detections - -.. code:: ipython3 - - box_annotator = sv.BoxAnnotator() - label_annotator = sv.LabelAnnotator() - labels = [f"{classes_prompt[class_id] if class_id is not None else 'None'} {confidence:0.2f}" for _, _, confidence, class_id, _, _ in detections] - annotated_frame = box_annotator.annotate(scene=np.array(pil_image).copy(), detections=detections) - annotated_frame = label_annotator.annotate(scene=annotated_frame.copy(), detections=detections, labels=labels) - - Image.fromarray(annotated_frame) - - - - -.. image:: grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_30_0.png - - - -Great! All clouds and horses are detected. Feel free to play around and -specify other objects you wish to detect. - -Convert SAM to OpenVINO IR -~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -And now let’s feed those detection to ``SAM`` model. We will use -``EfficiendSAM`` for faster computation and to save ram, but feel free -to select vanilla ``SAM`` if you wish more detailed and precise -segmentation. First of all let’s convert ``SAM`` model to OpenVINO IR. - -.. 
code:: ipython3 - - ov_efficient_sam_name = "openvino_efficient_sam" - ov_efficient_sam_path = IRS_PATH / f"{ov_efficient_sam_name}.xml" - - # convert EfficientSAM to OpenVINO IR format - if not ov_efficient_sam_path.exists() and use_efficient_sam: - random_input_image = np.random.rand(1, 3, *pil_image.size[::-1]).astype(np.float32) - bounding_box = np.array([900, 100, 1000, 200]).reshape([1, 1, 2, 2]) - bbox_labels = np.array([2, 3]).reshape([1, 1, 2]) - efficient_sam_dummy_input = tuple(torch.from_numpy(x) for x in (random_input_image, bounding_box, bbox_labels)) - - ov_efficient_sam = ov.convert_model(efficient_sam_model, example_input=efficient_sam_dummy_input) - ov.save_model(ov_efficient_sam, ov_efficient_sam_path) - elif use_efficient_sam: - ov_efficient_sam = core.read_model(ov_efficient_sam_path) - - -.. parsed-literal:: - - TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - - -Below is conversion of vanilla ``SAM``. This code is not used when -``EfficientSAM`` is selected for segmentation. - -.. 
code:: ipython3 - - # In order to convert to OpenVINO IR neeed to patch forward method or the torch.nn.Module for SAM - class SamMaskFromBoxes(torch.nn.Module): - def __init__( - self, - sam_predictor, - ) -> None: - super().__init__() - self.model = sam_predictor - - @torch.no_grad() - def forward( - self, - input_image: torch.Tensor, - transformed_boxes: torch.Tensor, - multimask_output: bool = False, - hq_token_only: bool = False, - ): - pre_processed_image = self.model.model.preprocess(input_image) - image_embeddings, interm_features = self.model.model.image_encoder(pre_processed_image) - - # Embed prompts - sparse_embeddings, dense_embeddings = self.model.model.prompt_encoder( - points=None, - boxes=transformed_boxes, - masks=None, - ) - - # Predict masks - low_res_masks, iou_predictions = self.model.model.mask_decoder( - image_embeddings=image_embeddings, - image_pe=self.model.model.prompt_encoder.get_dense_pe(), - sparse_prompt_embeddings=sparse_embeddings, - dense_prompt_embeddings=dense_embeddings, - multimask_output=multimask_output, - hq_token_only=hq_token_only, - interm_embeddings=interm_features, - ) - - return low_res_masks, iou_predictions - -.. code:: ipython3 - - ov_sam_path = IRS_PATH / f"{ov_sam_name}.xml" - - # example input for vanilla SAM - input_image_torch = torch.randint(0, 255, size=[1, 3, 683, 1024], dtype=torch.uint8) - dummy_transformed_boxes = torch.rand(1, 4, dtype=torch.float32) * 200 - - # convert vanilla SAM to OpenVINO IR format - if not ov_sam_path.exists() and not use_efficient_sam: - # Load pytorch model object and prepare example input for conversion - exportable = SamMaskFromBoxes(sam_predictor) - exportable.model.model.eval() - for par in exportable.model.model.parameters(): - par.requires_grad = False - - traced = torch.jit.trace(exportable, example_inputs=(input_image_torch, dummy_transformed_boxes)) - ov_sam = ov.convert_model(traced, example_input=(input_image_torch, dummy_transformed_boxes)) - ov.save_model(ov_sam, ov_sam_path) - elif not use_efficient_sam: - ov_sam = core.read_model(ov_sam_path) - -.. code:: ipython3 - - if use_efficient_sam: - compiled_efficient_sam = core.compile_model(ov_efficient_sam, device_name=device.value) - else: - compiled_vanilla_sam = core.compile_model(ov_sam, device_name=device.value) - -Combine GroundingDINO + SAM (GroundedSAM) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We have OpenVINO IRs for both GroundingDINO and SAM models. Lets run the -segmentation using predictions from GroundingDINO. Same as above, use -``EfficientSAM`` by default. - -.. code:: ipython3 - - def predict_efficient_sam_mask(compiled_efficient_sam: ov.CompiledModel, image: Image.Image, bbox: torch.Tensor): - # input image is scaled so that none of the sizes is greater than 1024, same as in efficient-sam notebook - input_size = 1024 - w, h = image.size[:2] - scale = input_size / max(w, h) - new_w = int(w * scale) - new_h = int(h * scale) - image = image.resize((new_w, new_h)) - - numpy_image = np.array(image, dtype=np.float32) / 255.0 - numpy_image = np.transpose(numpy_image, (2, 0, 1))[None, ...] - - scaled_points = bbox * scale - - bounding_box = scaled_points.reshape([1, 1, 2, 2]) - bbox_labels = np.reshape(np.array([2, 3]), [1, 1, 2]) - - res = compiled_efficient_sam((numpy_image, bounding_box, bbox_labels)) - - predicted_logits, predicted_iou = res[0], res[1] - - all_masks = torch.ge(torch.sigmoid(torch.from_numpy(predicted_logits[0, 0, :, :, :])), 0.5).numpy() - predicted_iou = predicted_iou[0, 0, ...] 
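- # The indexing above keeps batch 0 / query 0: the model proposes several candidate masks for the - # box prompt, each with a predicted IoU score; the loop below selects the candidate with the - # highest predicted IoU.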
- - # select the mask with the greatest IOU - max_predicted_iou = -1 - selected_mask_using_predicted_iou = None - for m in range(all_masks.shape[0]): - curr_predicted_iou = predicted_iou[m] - if curr_predicted_iou > max_predicted_iou or selected_mask_using_predicted_iou is None: - max_predicted_iou = curr_predicted_iou - selected_mask_using_predicted_iou = all_masks[m] - return selected_mask_using_predicted_iou - - - # If several detections are fed to EfficientSAM, it merges them to a single mask. Therefore, we call it one by one for each detection. - def predict_efficient_sam_masks(compiled_efficient_sam: ov.CompiledModel, pil_image: Image.Image, transformed_boxes) -> torch.Tensor: - masks = [] - for bbox in transformed_boxes: - mask = predict_efficient_sam_mask(compiled_efficient_sam, pil_image, bbox) - mask = Image.fromarray(mask).resize(pil_image.size) - masks.append(np.array(mask)) - masks = torch.from_numpy(np.array(masks)) - return masks - -.. code:: ipython3 - - def transform_boxes(sam_predictor: torch.nn.Module, boxes: torch.Tensor, size: tuple) -> torch.Tensor: - H, W = size[0], size[1] - for i in range(boxes.size(0)): - boxes[i] = boxes[i] * torch.Tensor([W, H, W, H]) - boxes[i][:2] -= boxes[i][2:] / 2 - boxes[i][2:] += boxes[i][:2] - - return sam_predictor.transform.apply_boxes_torch(boxes, size).to(PT_DEVICE) - - - def predict_vanilla_sam_masks( - compiled_vanilla_sam: ov.CompiledModel, - image: np.ndarray, - transformed_boxes: torch.Tensor, - ) -> torch.Tensor: - transfromed_image = exportable.model.transform.apply_image(image) - input_image_torch = torch.as_tensor(transfromed_image, device=PT_DEVICE) - input_image_torch = input_image_torch.permute(2, 0, 1).contiguous()[None, :, :, :] - - original_size = tuple(image.shape[:2]) - input_size = tuple(input_image_torch.shape[-2:]) - - low_res_masks = compiled_vanilla_sam((input_image_torch, transformed_boxes))[0] - - # Upscale the masks to the original image resolution - masks = exportable.model.model.postprocess_masks(torch.from_numpy(low_res_masks), input_size, original_size) - masks = masks > exportable.model.model.mask_threshold - return masks - -Run SAM model for the same image with the detected boxes from -GroundingDINO. - -Please note that vanilla SAM and EfficientSAM have slightly different -detection formats. But inputs for both of them originate from -``boxes_filt`` which is result of the ``get_ov_grounding_output``. For -EfficientSAM we use ``detections.xyxy`` boxes obtained after -``boxes_filt`` is fed to ``Model.post_process_result``. While vanilla -SAM has it’s own preprocessing function ``transform_boxes``. - -.. code:: ipython3 - - if use_efficient_sam: - masks = predict_efficient_sam_masks(compiled_efficient_sam, pil_image, detections.xyxy) - detections.mask = masks.numpy() - else: - transformed_boxes = transform_boxes(sam_predictor, boxes_filt, pil_image.size[::-1]) - masks = predict_vanilla_sam_masks(compiled_vanilla_sam, np.array(pil_image), transformed_boxes) - detections.mask = masks[:, 0].numpy() - -Combine both boxes and segmentation masks and draw them. - -.. 
code:: ipython3 - - box_annotator = sv.BoxAnnotator() - mask_annotator = sv.MaskAnnotator() - label_annotator = sv.LabelAnnotator() - - annotated_image = np.array(pil_image) - annotated_image = mask_annotator.annotate(scene=np.array(pil_image).copy(), detections=detections) - annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections) - annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections, labels=labels) - - - Image.fromarray(annotated_image) - - - - -.. image:: grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_46_0.png - - - -Great! All detected horses and clouds are segmented as well. - -Interactive GroundedSAM -~~~~~~~~~~~~~~~~~~~~~~~ - - - -Now, you can try apply grounding sam on your own images using -interactive demo. The code below provides helper functions used in -demonstration. - -.. code:: ipython3 - - def draw_mask(mask, draw, random_color=False): - if random_color: - color = ( - np.random.randint(0, 255), - np.random.randint(0, 255), - np.random.randint(0, 255), - 153, - ) - else: - color = (30, 144, 255, 153) - - nonzero_coords = np.transpose(np.nonzero(mask)) - - for coord in nonzero_coords: - draw.point(coord[::-1], fill=color) - - - def draw_box(box, draw, label): - # random color - color = tuple(np.random.randint(0, 255, size=3).tolist()) - - draw.rectangle(((box[0], box[1]), (box[2], box[3])), outline=color, width=4) - - if label: - font = ImageFont.load_default(18) - if hasattr(font, "getbbox"): - bbox = draw.textbbox((box[0], box[1]), str(label), font, anchor="ld") - else: - w, h = draw.textsize(str(label), font) - bbox = (box[0], box[1], box[0] + w, box[1] + h) - draw.rectangle(bbox, fill=color) - draw.text((box[0], box[1]), str(label), fill="white", anchor="ld", font=font) - -.. 
code:: ipython3 - - import gradio as gr - - """" - run_grounding_sam is called every time "Submit" button is clicked - """ - - - def run_grounding_sam(image, task_type, text_prompt, box_threshold, text_threshold): - pil_image = Image.fromarray(image) - size = image.shape[1], image.shape[0] # size is WH image.shape HWC - - boxes_filt, scores, pred_phrases = get_ov_grounding_output(ov_compiled_grounded_dino, pil_image, text_prompt, box_threshold, text_threshold) - - # process boxes - H, W = size[1], size[0] - for i in range(boxes_filt.size(0)): - boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H]) - boxes_filt[i][:2] -= boxes_filt[i][2:] / 2 - boxes_filt[i][2:] += boxes_filt[i][:2] - - if task_type == "seg": - if use_efficient_sam: - masks = predict_efficient_sam_masks(compiled_efficient_sam, pil_image, boxes_filt.numpy()) - else: - transformed_boxes = sam_predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2]).to(PT_DEVICE) - masks = predict_vanilla_sam_masks(compiled_vanilla_sam, image, transformed_boxes)[:, 0] - - mask_image = Image.new("RGBA", size, color=(0, 0, 0, 0)) - mask_draw = ImageDraw.Draw(mask_image) - for mask in masks: - draw_mask(mask.numpy(), mask_draw, random_color=True) - - image_draw = ImageDraw.Draw(pil_image) - for box, label in zip(boxes_filt, pred_phrases): - draw_box(box, image_draw, label) - - pil_image = pil_image.convert("RGBA") - pil_image.alpha_composite(mask_image) - - return [pil_image, mask_image] - if task_type == "det": - image_draw = ImageDraw.Draw(pil_image) - for box, label in zip(boxes_filt, pred_phrases): - draw_box(box, image_draw, label) - return [pil_image] - else: - gr.Warning(f"task_type:{task_type} error!") - -You can run interactive app with your own image and text prompts. To -define prompt specify comma (or conjunction) separated names of objects -you wish to segment. For demonstration, this demo already has two -predefined examples. If many object are crowded and overlapping please -increase threshold values in ``Advanced options``. - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/grounded-segment-anything/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=run_grounding_sam) - - try: - demo.launch(debug=False, height=1000) - except Exception: - demo.launch(share=True, debug=False, height=1000) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - - -.. parsed-literal:: - - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. - - - - - - - - -Cleanup -~~~~~~~ - - - -.. 
code:: ipython3 - - # import shutil - # shutil.rmtree(CKPT_BASE_PATH) - # shutil.rmtree(IRS_PATH) diff --git a/docs/notebooks/grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_30_0.jpg b/docs/notebooks/grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_30_0.jpg deleted file mode 100644 index 3bb8cb4c005844..00000000000000 --- a/docs/notebooks/grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_30_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b77359a6b56eb97230fc534071549c03c40e608700289f8a776b44538dd36bef -size 397455 diff --git a/docs/notebooks/grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_30_0.png b/docs/notebooks/grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_30_0.png deleted file mode 100644 index 2171feecfaffaa..00000000000000 --- a/docs/notebooks/grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_30_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c1e8226253f213b3ab5d069d0f61f3786343efcc180ec5287ce8a5b9e4818b7 -size 2756537 diff --git a/docs/notebooks/grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_46_0.jpg b/docs/notebooks/grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_46_0.jpg deleted file mode 100644 index fb20bcf39cf3b4..00000000000000 --- a/docs/notebooks/grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_46_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7ed9d707bf9eb6b77038d7d888570a6b2ab430de2eaf16dd18c99fe37529c339 -size 374818 diff --git a/docs/notebooks/grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_46_0.png b/docs/notebooks/grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_46_0.png deleted file mode 100644 index 347bba9672fefa..00000000000000 --- a/docs/notebooks/grounded-segment-anything-with-output_files/grounded-segment-anything-with-output_46_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cb3138d3712484c6e4643f545655e07fefe9e9dba55f45e386f6f0d4160fdf12 -size 2667202 diff --git a/docs/notebooks/handwritten-ocr-with-output.rst b/docs/notebooks/handwritten-ocr-with-output.rst deleted file mode 100644 index 7ac8112874e7e3..00000000000000 --- a/docs/notebooks/handwritten-ocr-with-output.rst +++ /dev/null @@ -1,434 +0,0 @@ -Handwritten Chinese and Japanese OCR with OpenVINO™ -=================================================== - -In this tutorial, we perform optical character recognition (OCR) for -handwritten Chinese (simplified) and Japanese. An OCR tutorial using the -Latin alphabet is available in `notebook -208 `__. -This model is capable of processing only one line of symbols at a time. - -The models used in this notebook are -`handwritten-japanese-recognition-0001 `__ -and -`handwritten-simplified-chinese-0001 `__. -To decode model outputs as readable text -`kondate_nakayosi `__ -and -`scut_ept `__ -charlists are used. Both models are available on `Open Model -Zoo `__. 
- - -**Table of contents:** - - -- `Imports <#imports>`__ -- `Settings <#settings>`__ -- `Select a Language <#select-a-language>`__ -- `Download the Model <#download-the-model>`__ -- `Load the Model and Execute <#load-the-model-and-execute>`__ -- `Select inference device <#select-inference-device>`__ -- `Fetch Information About Input and Output - Layers <#fetch-information-about-input-and-output-layers>`__ -- `Load an Image <#load-an-image>`__ -- `Visualize Input Image <#visualize-input-image>`__ -- `Prepare Charlist <#prepare-charlist>`__ -- `Run Inference <#run-inference>`__ -- `Process the Output Data <#process-the-output-data>`__ -- `Print the Output <#print-the-output>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - # Install openvino package - %pip install -q "openvino>=2023.1.0" opencv-python tqdm "matplotlib>=3.4" - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -Imports -------- - - - -.. code:: ipython3 - - from collections import namedtuple - from itertools import groupby - from pathlib import Path - - import cv2 - import matplotlib.pyplot as plt - import numpy as np - import openvino as ov - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("handwritten-ocr.ipynb") - -Settings --------- - - - -Set up all constants and folders used in this notebook - -.. code:: ipython3 - - # Directories where data will be placed. - base_models_dir = Path("models") - data_folder = Path("data") - charlist_folder = Path(f"{data_folder}/text") - - # Precision used by the model. - precision = "FP16" - -To group files, you have to define the collection. In this case, use -``namedtuple``. - -.. code:: ipython3 - - Language = namedtuple(typename="Language", field_names=["model_name", "charlist_name", "demo_image_name"]) - chinese_files = Language( - model_name="handwritten-simplified-chinese-recognition-0001", - charlist_name="chinese_charlist.txt", - demo_image_name="handwritten_chinese_test.jpg", - ) - japanese_files = Language( - model_name="handwritten-japanese-recognition-0001", - charlist_name="japanese_charlist.txt", - demo_image_name="handwritten_japanese_test.png", - ) - -Select a Language ------------------ - - - -Depending on your choice you will need to change a line of code in the -cell below. - -If you want to perform OCR on a text in Japanese, set -``language = "japanese"``. For Chinese, set ``language = "chinese"``. - -.. code:: ipython3 - - # Select the language by using either language="chinese" or language="japanese". - language = "chinese" - - languages = {"chinese": chinese_files, "japanese": japanese_files} - - selected_language = languages.get(language) - -Download the Model ------------------- - - - -In addition to images and charlists, you need to download the model -file. 
In the sections below, there are cells for downloading either the -Chinese or Japanese model. - -If it is your first time running the notebook, the model will be -downloaded. It may take a few minutes. - -Use ``download_file`` function from the utils package, which -automatically creates a directory structure and downloads the selected -model file. - -.. code:: ipython3 - - path_to_model = base_models_dir / f"{selected_language.model_name}.xml" - - if not path_to_model.exists(): - path_to_model = download_file( - url=f"https://storage.openvinotoolkit.org/repositories/open_model_zoo/2023.0/models_bin/1/{selected_language.model_name}/{precision}/{selected_language.model_name}.xml", - directory=base_models_dir, - ) - download_file( - url=f"https://storage.openvinotoolkit.org/repositories/open_model_zoo/2023.0/models_bin/1/{selected_language.model_name}/{precision}/{selected_language.model_name}.bin", - directory=base_models_dir, - ) - - - -.. parsed-literal:: - - handwritten-simplified-chinese-recognition-0001.xml: 0%| | 0.00/108k [00:00`__, -you will remove concurrent symbols and then remove the blanks. - -Finally, get the symbols from corresponding indexes in the charlist. - -.. code:: ipython3 - - # Remove a batch dimension. - predictions = np.squeeze(predictions) - - # Run the `argmax` function to pick the symbols with the highest probability. - predictions_indexes = np.argmax(predictions, axis=1) - -.. code:: ipython3 - - # Use the `groupby` function to remove concurrent letters, as required by CTC greedy decoding. - output_text_indexes = list(groupby(predictions_indexes)) - - # Remove grouper objects. - output_text_indexes, _ = np.transpose(output_text_indexes, (1, 0)) - - # Remove blank symbols. - output_text_indexes = output_text_indexes[output_text_indexes != 0] - - # Assign letters to indexes from the output array. - output_text = [letters[letter_index] for letter_index in output_text_indexes] - -Print the Output ----------------- - - - -Now, having a list of letters predicted by the model, you can display -the image with predicted text printed below. - -.. code:: ipython3 - - plt.figure(figsize=(20, 1)) - plt.axis("off") - plt.imshow(resized_image, cmap="gray", vmin=0, vmax=255) - - print("".join(output_text)) - - -.. parsed-literal:: - - 人有悲欢离合,月有阴睛圆缺,此事古难全。 - - - -.. 
image:: handwritten-ocr-with-output_files/handwritten-ocr-with-output_32_1.png - diff --git a/docs/notebooks/handwritten-ocr-with-output_files/handwritten-ocr-with-output_22_0.png b/docs/notebooks/handwritten-ocr-with-output_files/handwritten-ocr-with-output_22_0.png deleted file mode 100644 index 7e0c09a703a97b..00000000000000 --- a/docs/notebooks/handwritten-ocr-with-output_files/handwritten-ocr-with-output_22_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9ce052db324821165a2b1bc5dea9d05588886c1794c0f217aaa47b8442c76aad -size 53571 diff --git a/docs/notebooks/handwritten-ocr-with-output_files/handwritten-ocr-with-output_32_1.png b/docs/notebooks/handwritten-ocr-with-output_files/handwritten-ocr-with-output_32_1.png deleted file mode 100644 index 7e0c09a703a97b..00000000000000 --- a/docs/notebooks/handwritten-ocr-with-output_files/handwritten-ocr-with-output_32_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9ce052db324821165a2b1bc5dea9d05588886c1794c0f217aaa47b8442c76aad -size 53571 diff --git a/docs/notebooks/hello-detection-with-output.rst b/docs/notebooks/hello-detection-with-output.rst deleted file mode 100644 index c065b2220868fa..00000000000000 --- a/docs/notebooks/hello-detection-with-output.rst +++ /dev/null @@ -1,274 +0,0 @@ -Hello Object Detection -====================== - -A very basic introduction to using object detection models with -OpenVINO™. - -The -`horizontal-text-detection-0001 `__ -model from `Open Model -Zoo `__ is used. It -detects horizontal text in images and returns a blob of data in the -shape of ``[100, 5]``. Each detected text box is stored in the -``[x_min, y_min, x_max, y_max, conf]`` format, where the -``(x_min, y_min)`` are the coordinates of the top left bounding box -corner, ``(x_max, y_max)`` are the coordinates of the bottom right -bounding box corner and ``conf`` is the confidence for the predicted -class. - - -**Table of contents:** - - -- `Imports <#imports>`__ -- `Download model weights <#download-model-weights>`__ -- `Select inference device <#select-inference-device>`__ -- `Load the Model <#load-the-model>`__ -- `Load an Image <#load-an-image>`__ -- `Do Inference <#do-inference>`__ -- `Visualize Results <#visualize-results>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - # Install openvino package - %pip install -q "openvino>=2023.1.0" opencv-python tqdm "matplotlib>=3.4" - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -Imports -------- - - - -.. 
code:: ipython3 - - import cv2 - import matplotlib.pyplot as plt - import numpy as np - import openvino as ov - from pathlib import Path - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("hello-detection.ipynb") - -Download model weights ----------------------- - - - -.. code:: ipython3 - - base_model_dir = Path("./model").expanduser() - - model_name = "horizontal-text-detection-0001" - model_xml_name = f"{model_name}.xml" - model_bin_name = f"{model_name}.bin" - - model_xml_path = base_model_dir / model_xml_name - model_bin_path = base_model_dir / model_bin_name - - if not model_xml_path.exists(): - model_xml_url = "https://storage.openvinotoolkit.org/repositories/open_model_zoo/2022.3/models_bin/1/horizontal-text-detection-0001/FP32/horizontal-text-detection-0001.xml" - model_bin_url = "https://storage.openvinotoolkit.org/repositories/open_model_zoo/2022.3/models_bin/1/horizontal-text-detection-0001/FP32/horizontal-text-detection-0001.bin" - - download_file(model_xml_url, model_xml_name, base_model_dir) - download_file(model_bin_url, model_bin_name, base_model_dir) - else: - print(f"{model_name} already downloaded to {base_model_dir}") - - - -.. parsed-literal:: - - horizontal-text-detection-0001.xml: 0%| | 0.00/680k [00:00 threshold: - # Convert float to int and multiply corner position of each box by x and y ratio. - # If the bounding box is found at the top of the image, - # position the upper box bar little lower to make it visible on the image. - (x_min, y_min, x_max, y_max) = [ - (int(max(corner_position * ratio_y, 10)) if idx % 2 else int(corner_position * ratio_x)) for idx, corner_position in enumerate(box[:-1]) - ] - - # Draw a box based on the position, parameters in rectangle function are: image, start_point, end_point, color, thickness. - rgb_image = cv2.rectangle(rgb_image, (x_min, y_min), (x_max, y_max), colors["green"], 3) - - # Add text to the image based on position and confidence. - # Parameters in text function are: image, text, bottom-left_corner_textfield, font, font_scale, color, thickness, line_type. - if conf_labels: - rgb_image = cv2.putText( - rgb_image, - f"{conf:.2f}", - (x_min, y_min - 10), - cv2.FONT_HERSHEY_SIMPLEX, - 0.8, - colors["red"], - 1, - cv2.LINE_AA, - ) - - return rgb_image - -.. code:: ipython3 - - plt.figure(figsize=(10, 6)) - plt.axis("off") - plt.imshow(convert_result_to_image(image, resized_image, boxes, conf_labels=False)); - - - -.. 
image:: hello-detection-with-output_files/hello-detection-with-output_16_0.png - diff --git a/docs/notebooks/hello-detection-with-output_files/hello-detection-with-output_11_1.png b/docs/notebooks/hello-detection-with-output_files/hello-detection-with-output_11_1.png deleted file mode 100644 index b696d287ded448..00000000000000 --- a/docs/notebooks/hello-detection-with-output_files/hello-detection-with-output_11_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c7a830fedc5653fd506c656144decc048cad5a7651c8e498024f0eb0ab8c8e96 -size 305482 diff --git a/docs/notebooks/hello-detection-with-output_files/hello-detection-with-output_16_0.png b/docs/notebooks/hello-detection-with-output_files/hello-detection-with-output_16_0.png deleted file mode 100644 index 5e6438a788597e..00000000000000 --- a/docs/notebooks/hello-detection-with-output_files/hello-detection-with-output_16_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:edb00cb4f0e2c42cd9e0f90939afbd6352ca40c90866821898f2c42c1fd9df64 -size 457214 diff --git a/docs/notebooks/hello-npu-with-output.rst b/docs/notebooks/hello-npu-with-output.rst deleted file mode 100644 index 35386e70fc6049..00000000000000 --- a/docs/notebooks/hello-npu-with-output.rst +++ /dev/null @@ -1,965 +0,0 @@ -Hello NPU -========= - -Working with NPU in OpenVINO™ ------------------------------ - - -**Table of contents:** - - -- `Introduction <#introduction>`__ - - - `Install required packages <#install-required-packages>`__ - -- `Checking NPU with Query Device <#checking-npu-with-query-device>`__ - - - `List the NPU with - core.available_devices <#list-the-npu-with-core-available_devices>`__ - - `Check Properties with - core.get_property <#check-properties-with-core-get_property>`__ - - `Brief Descriptions of Key - Properties <#brief-descriptions-of-key-properties>`__ - -- `Compiling a Model on NPU <#compiling-a-model-on-npu>`__ - - - `Download a Model <#download-and-convert-a-model>`__ - - `Compile with Default - Configuration <#compile-with-default-configuration>`__ - - `Reduce Compile Time through Model - Caching <#reduce-compile-time-through-model-caching>`__ - - - `UMD Model Caching <#umd-model-caching>`__ - - `OpenVINO Model Caching <#openvino-model-caching>`__ - - - `Throughput and Latency Performance - Hints <#throughput-and-latency-performance-hints>`__ - -- `Performance Comparison with - benchmark_app <#performance-comparison-with-benchmark_app>`__ - - - `NPU vs CPU with Latency Hint <#npu-vs-cpu-with-latency-hint>`__ - - - `Effects of UMD Model - Caching <#effects-of-umd-model-caching>`__ - - - `NPU vs CPU with Throughput - Hint <#npu-vs-cpu-with-throughput-hint>`__ - -- `Limitations <#limitations>`__ -- `Conclusion <#conclusion>`__ - - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -This tutorial provides a high-level overview of working with the NPU -device **Intel(R) AI Boost** (introduced with the Intel® Core™ Ultra -generation of CPUs) in OpenVINO. It explains some of the key properties -of the NPU and shows how to compile a model on NPU with performance -hints. - -This tutorial also shows example commands for benchmark_app that can be -run to compare NPU performance with CPU in different configurations. 
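As a quick preview of the workflow described in the following sections, the
minimal sketch below lists the available devices and compiles an OpenVINO IR
model on the NPU with a latency-oriented hint. It assumes the NPU driver is
installed and that an IR file is already on disk (the ``resnet50`` IR used
here is downloaded later in this tutorial, so adjust the path if you run it
out of order).

.. code:: ipython3

    import openvino as ov
    import openvino.properties.hint as hints

    core = ov.Core()
    print(core.available_devices)  # the NPU should be listed, for example ['CPU', 'GPU', 'NPU']

    # Compile an IR model on the NPU with the LATENCY performance hint.
    model = core.read_model("model/ir_model/resnet50_fp16.xml")
    compiled_model = core.compile_model(model, "NPU", {hints.performance_mode(): hints.PerformanceMode.LATENCY})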
- -Introduction ------------- - - - -The Neural Processing Unit (NPU) is a low power hardware solution which -enables you to offload certain neural network computation tasks from -other devices, for more streamlined resource management. - -Note that the NPU plugin is included in PIP installation of OpenVINO™ -and you need to `install a proper NPU -driver `__ -to use it successfully. - -| **Supported Platforms**: -| Host: Intel® Core™ Ultra -| NPU device: NPU 3720 -| OS: Ubuntu 22.04 (with Linux Kernel 6.6+), MS Windows 11 (both 64-bit) - -To learn more about the NPU Device, see the -`page `__. - -Install required packages -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %pip install -q "openvino>=2024.1.0" huggingface_hub - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("hello-npu.ipynb") - -Checking NPU with Query Device ------------------------------- - - - -In this section, we will see how to list the available NPU and check its -properties. Some of the key properties will be defined. - -List the NPU with core.available_devices -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -OpenVINO Runtime provides the ``available_devices`` method for checking -which devices are available for inference. The following code will -output a list a compatible OpenVINO devices, in which Intel NPU should -appear (ensure that the driver is installed successfully). - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - core.available_devices - - - - -.. parsed-literal:: - - ['CPU', 'GPU', 'NPU'] - - - -Check Properties with core.get_property -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To get information about the NPU, we can use device properties. In -OpenVINO, devices have properties that describe their characteristics -and configurations. Each property has a name and associated value that -can be queried with the ``get_property`` method. - -To get the value of a property, such as the device name, we can use the -``get_property`` method as follows: - -.. code:: ipython3 - - import openvino.properties as props - - - device = "NPU" - - core.get_property(device, props.device.full_name) - - - - -.. parsed-literal:: - - 'Intel(R) AI Boost' - - - -Each device also has a specific property called -``SUPPORTED_PROPERTIES``, that enables viewing all the available -properties in the device. We can check the value for each property by -simply looping through the dictionary returned by -``core.get_property("NPU", props.supported_properties)`` and then -querying for that property. - -.. 
code:: ipython3 - - print(f"{device} SUPPORTED_PROPERTIES:\n") - supported_properties = core.get_property(device, props.supported_properties) - indent = len(max(supported_properties, key=len)) - - for property_key in supported_properties: - if property_key not in ("SUPPORTED_METRICS", "SUPPORTED_CONFIG_KEYS", "SUPPORTED_PROPERTIES"): - try: - property_val = core.get_property(device, property_key) - except TypeError: - property_val = "UNSUPPORTED TYPE" - print(f"{property_key:<{indent}}: {property_val}") - -Brief Descriptions of Key Properties -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Each device has several properties as seen in the last command. Some of -the key properties are: - ``FULL_DEVICE_NAME`` - The product name of the -NPU. - ``PERFORMANCE_HINT`` - A high-level way to tune the device for a -specific performance metric, such as latency or throughput, without -worrying about device-specific settings. - ``CACHE_DIR`` - The directory -where the OpenVINO model cache data is stored to speed up the -compilation time. - ``OPTIMIZATION_CAPABILITIES`` - The model data types -(INT8, FP16, FP32, etc) that are supported by this NPU. - -To learn more about devices and properties, see the `Query Device -Properties `__ -page. - -Compiling a Model on NPU ------------------------- - - - -Now, we know the NPU present in the system and we have checked its -properties. We can easily use it for compiling and running models with -OpenVINO NPU plugin. - -Download a Model ----------------- - - - -This tutorial uses the ``resnet50`` model. The ``resnet50`` model is -used for image classification tasks. The model was trained on -`ImageNet `__ dataset which -contains over a million images categorized into 1000 classes. To read -more about resnet50, see the -`paper `__. As our -tutorial focused on inference part, we skip model conversion step. To -convert this Pytorch model to OpenVINO IR, `Model Conversion -API `__ -should be used. Please check this -`tutorial `__ -for details how to convert pytorch model. - -.. code:: ipython3 - - from pathlib import Path - - # create a directory for resnet model file - MODEL_DIRECTORY_PATH = Path("model") - MODEL_DIRECTORY_PATH.mkdir(exist_ok=True) - - model_name = "resnet50" - -.. code:: ipython3 - - import huggingface_hub as hf_hub - -.. code:: ipython3 - - precision = "FP16" - - model_path = MODEL_DIRECTORY_PATH / "ir_model" / f"{model_name}_{precision.lower()}.xml" - - model = None - if not model_path.exists(): - hf_hub.snapshot_download("katuni4ka/resnet50_fp16", local_dir=model_path.parent) - print("IR model saved to {}".format(model_path)) - model = core.read_model(model_path) - else: - print("Read IR model from {}".format(model_path)) - model = core.read_model(model_path) - - -.. parsed-literal:: - - Read IR model from model\ir_model\resnet50_fp16.xml - - -**Note:** NPU also supports ``INT8`` quantized models. - -Compile with Default Configuration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -When the model is ready, first we need to read it, using the -``read_model`` method. Then, we can use the ``compile_model`` method and -specify the name of the device we want to compile the model on, in this -case, “NPU”. - -.. 
code:: ipython3 - - compiled_model = core.compile_model(model, device) - -Reduce Compile Time through Model Caching -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Depending on the model used, device-specific optimizations and network -compilations can cause the compile step to be time-consuming, especially -with larger models, which may lead to bad user experience in the -application. To solve this **Model Caching** can be used. - -Model Caching helps reduce application startup delays by exporting and -reusing the compiled model automatically. The following two -compilation-related metrics are crucial in this area: - -- **First-Ever Inference Latency (FEIL)**: - Measures all steps required to compile and execute a model on the - device for the first time. It includes model compilation time, the - time required to load and initialize the model on the device and the - first inference execution. -- **First Inference Latency (FIL)**: - Measures the time required to load and initialize the pre-compiled - model on the device and the first inference execution. - -In NPU, UMD model caching is a solution enabled by default by the -driver. It improves time to first inference (FIL) by storing the model -in the cache after compilation (included in FEIL). Learn more about UMD -Caching -`here `__. -Due to this caching, it takes lesser time to load the model after first -compilation. - -| You can also use OpenVINO Model Caching, which is a common mechanism - for all OpenVINO device plugins and can be enabled by setting the - ``cache_dir`` property. -| By enabling OpenVINO Model Caching, the UMD caching is automatically - bypassed by the NPU plugin, which means the model will only be stored - in the OpenVINO cache after compilation. When a cache hit occurs for - subsequent compilation requests, the plugin will import the model - instead of recompiling it. - -UMD Model Caching -^^^^^^^^^^^^^^^^^ - - - -To see how UMD caching see the following example: - -.. code:: ipython3 - - import time - from pathlib import Path - - start = time.time() - core = ov.Core() - - # Compile the model as before - model = core.read_model(model=model_path) - compiled_model = core.compile_model(model, device) - print(f"UMD Caching (first time) - compile time: {time.time() - start}s") - - -.. parsed-literal:: - - UMD Caching (first time) - compile time: 3.2854952812194824s - - -.. code:: ipython3 - - start = time.time() - core = ov.Core() - - # Compile the model once again to see UMD Caching - model = core.read_model(model=model_path) - compiled_model = core.compile_model(model, device) - print(f"UMD Caching - compile time: {time.time() - start}s") - - -.. parsed-literal:: - - UMD Caching - compile time: 2.269814968109131s - - -OpenVINO Model Caching -^^^^^^^^^^^^^^^^^^^^^^ - - - -To get an idea of OpenVINO model caching, we can use the OpenVINO cache -as follow - -.. 
code:: ipython3 - - # Create cache folder - cache_folder = Path("cache") - cache_folder.mkdir(exist_ok=True) - - start = time.time() - core = ov.Core() - - # Set cache folder - core.set_property({props.cache_dir(): cache_folder}) - - # Compile the model - model = core.read_model(model=model_path) - compiled_model = core.compile_model(model, device) - print(f"Cache enabled (first time) - compile time: {time.time() - start}s") - - start = time.time() - core = ov.Core() - - # Set cache folder - core.set_property({props.cache_dir(): cache_folder}) - - # Compile the model as before - model = core.read_model(model=model_path) - compiled_model = core.compile_model(model, device) - print(f"Cache enabled (second time) - compile time: {time.time() - start}s") - - -.. parsed-literal:: - - Cache enabled (first time) - compile time: 0.6362860202789307s - Cache enabled (second time) - compile time: 0.3032548427581787s - - -And when the OpenVINO cache is disabled: - -.. code:: ipython3 - - start = time.time() - core = ov.Core() - model = core.read_model(model=model_path) - compiled_model = core.compile_model(model, device) - print(f"Cache disabled - compile time: {time.time() - start}s") - - -.. parsed-literal:: - - Cache disabled - compile time: 3.0127954483032227s - - -The actual time improvements will depend on the environment as well as -the model being used but it is definitely something to consider when -optimizing an application. To read more about this, see the `Model -Caching -docs `__. - -Throughput and Latency Performance Hints -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To simplify device and pipeline configuration, OpenVINO provides -high-level performance hints that automatically set the batch size and -number of parallel threads for inference. The “LATENCY” performance hint -optimizes for fast inference times while the “THROUGHPUT” performance -hint optimizes for high overall bandwidth or FPS. - -To use the “LATENCY” performance hint, add -``{hints.performance_mode(): hints.PerformanceMode.LATENCY}`` when -compiling the model as shown below. For NPU, this automatically -minimizes the batch size and number of parallel streams such that all of -the compute resources can focus on completing a single inference as fast -as possible. - -.. code:: ipython3 - - import openvino.properties.hint as hints - - - compiled_model = core.compile_model(model, device, {hints.performance_mode(): hints.PerformanceMode.LATENCY}) - -To use the “THROUGHPUT” performance hint, add -``{hints.performance_mode(): hints.PerformanceMode.THROUGHPUT}`` when -compiling the model. For NPUs, this creates multiple processing streams -to efficiently utilize all the execution cores and optimizes the batch -size to fill the available memory. - -.. code:: ipython3 - - compiled_model = core.compile_model(model, device, {hints.performance_mode(): hints.PerformanceMode.THROUGHPUT}) - -Performance Comparison with benchmark_app ------------------------------------------ - - - -Given all the different options available when compiling a model, it may -be difficult to know which settings work best for a certain application. -Thankfully, OpenVINO provides ``benchmark_app`` - a performance -benchmarking tool. 
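Before switching to ``benchmark_app``, you can run a rough, Python-level
sanity check of the ``compiled_model`` from the previous cells. The sketch
below is only an approximation (the random input and the number of iterations
are arbitrary, and ``numpy`` is assumed to be available); ``benchmark_app``
remains the recommended way to obtain reliable latency and throughput numbers.

.. code:: ipython3

    import time
    import numpy as np

    # Random input matching the resnet50 IR input shape [1, 3, 224, 224], f32.
    dummy_input = np.random.rand(1, 3, 224, 224).astype(np.float32)

    compiled_model([dummy_input])  # warm-up run, not measured

    n_iters = 100
    start = time.time()
    for _ in range(n_iters):
        compiled_model([dummy_input])
    print(f"Average synchronous latency over {n_iters} runs: {(time.time() - start) / n_iters * 1000:.2f} ms")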
- -The basic syntax of ``benchmark_app`` is as follows: - -``benchmark_app -m PATH_TO_MODEL -d TARGET_DEVICE -hint {throughput,cumulative_throughput,latency,none}`` - -where ``TARGET_DEVICE`` is any device shown by the ``available_devices`` -method as well as the MULTI and AUTO devices we saw previously, and the -value of hint should be one of the values between brackets. - -Note that benchmark_app only requires the model path to run but both -device and hint arguments will be useful to us. For more advanced -usages, the tool itself has other options that can be checked by running -``benchmark_app -h`` or reading the -`docs `__. -The following example shows us to benchmark a simple model, using a NPU -with latency focus: - -``benchmark_app -m {model_path} -d NPU -hint latency`` - -| For completeness, let us list here some of the comparisons we may want - to do by varying the device and hint used. Note that the actual - performance may depend on the hardware used. Generally, we should - expect NPU to be better than CPU. -| Please refer to the ``benchmark_app`` log entries under - ``[Step 11/11] Dumping statistics report`` to observe the differences - in latency and throughput between the CPU and NPU.. - -NPU vs CPU with Latency Hint -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -.. code:: ipython3 - - !benchmark_app -m {model_path} -d CPU -hint latency - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.1.0-14992-621b025bef4 - [ INFO ] - [ INFO ] Device info: - [ INFO ] CPU - [ INFO ] Build ................................. 2024.1.0-14992-621b025bef4 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 14.00 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,224,224] - [ INFO ] Model outputs: - [ INFO ] x.45 (node: aten::linear/Add) : f32 / [...] / [1,1000] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,224,224] - [ INFO ] Model outputs: - [ INFO ] x.45 (node: aten::linear/Add) : f32 / [...] / [1,1000] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 143.22 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model2 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 - [ INFO ] NUM_STREAMS: 1 - [ INFO ] AFFINITY: Affinity.HYBRID_AWARE - [ INFO ] INFERENCE_NUM_THREADS: 12 - [ INFO ] PERF_COUNT: NO - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] PERFORMANCE_HINT: LATENCY - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] ENABLE_CPU_PINNING: False - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] ENABLE_HYPER_THREADING: False - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 0 - [ INFO ] KV_CACHE_PRECISION: - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. 
This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 1 inference requests, limits: 60000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 28.95 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 1612 iterations - [ INFO ] Duration: 60039.72 ms - [ INFO ] Latency: - [ INFO ] Median: 39.99 ms - [ INFO ] Average: 37.13 ms - [ INFO ] Min: 19.13 ms - [ INFO ] Max: 71.94 ms - [ INFO ] Throughput: 26.85 FPS - - -.. code:: ipython3 - - !benchmark_app -m {model_path} -d NPU -hint latency - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.1.0-14992-621b025bef4 - [ INFO ] - [ INFO ] Device info: - [ INFO ] NPU - [ INFO ] Build ................................. 2024.1.0-14992-621b025bef4 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 11.51 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,224,224] - [ INFO ] Model outputs: - [ INFO ] x.45 (node: aten::linear/Add) : f32 / [...] / [1,1000] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,224,224] - [ INFO ] Model outputs: - [ INFO ] x.45 (node: aten::linear/Add) : f32 / [...] / [1,1000] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 2302.40 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] DEVICE_ID: - [ INFO ] ENABLE_CPU_PINNING: False - [ INFO ] EXECUTION_DEVICES: NPU.3720 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] INTERNAL_SUPPORTED_PROPERTIES: {'CACHING_PROPERTIES': 'RO'} - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] NETWORK_NAME: - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 - [ INFO ] PERFORMANCE_HINT: PerformanceMode.LATENCY - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 1 - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 1 inference requests, limits: 60000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 7.94 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:NPU.3720 - [ INFO ] Count: 17908 iterations - [ INFO ] Duration: 60004.49 ms - [ INFO ] Latency: - [ INFO ] Median: 3.29 ms - [ INFO ] Average: 3.33 ms - [ INFO ] Min: 3.21 ms - [ INFO ] Max: 6.90 ms - [ INFO ] Throughput: 298.44 FPS - - -Effects of UMD Model Caching -'''''''''''''''''''''''''''' - - - -To see the effects of UMD Model caching, we are going to run the -benchmark_app and see the difference in model read time and compilation -time: - -.. code:: ipython3 - - !benchmark_app -m {model_path} -d NPU -hint latency - - -.. 
parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.1.0-14992-621b025bef4 - [ INFO ] - [ INFO ] Device info: - [ INFO ] NPU - [ INFO ] Build ................................. 2024.1.0-14992-621b025bef4 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 11.00 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,224,224] - [ INFO ] Model outputs: - [ INFO ] x.45 (node: aten::linear/Add) : f32 / [...] / [1,1000] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,224,224] - [ INFO ] Model outputs: - [ INFO ] x.45 (node: aten::linear/Add) : f32 / [...] / [1,1000] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 2157.58 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] DEVICE_ID: - [ INFO ] ENABLE_CPU_PINNING: False - [ INFO ] EXECUTION_DEVICES: NPU.3720 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] INTERNAL_SUPPORTED_PROPERTIES: {'CACHING_PROPERTIES': 'RO'} - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] NETWORK_NAME: - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 - [ INFO ] PERFORMANCE_HINT: PerformanceMode.LATENCY - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 1 - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 1 inference requests, limits: 60000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 7.94 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:NPU.3720 - [ INFO ] Count: 17894 iterations - [ INFO ] Duration: 60004.76 ms - [ INFO ] Latency: - [ INFO ] Median: 3.29 ms - [ INFO ] Average: 3.33 ms - [ INFO ] Min: 3.21 ms - [ INFO ] Max: 14.38 ms - [ INFO ] Throughput: 298.21 FPS - - -As you can see from the log entries ``[Step 4/11] Reading model files`` -and ``[Step 7/11] Loading the model to the device``, it takes less time -to read and compile the model after the initial load. - -NPU vs CPU with Throughput Hint -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -.. code:: ipython3 - - !benchmark_app -m {model_path} -d CPU -hint throughput - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.1.0-14992-621b025bef4 - [ INFO ] - [ INFO ] Device info: - [ INFO ] CPU - [ INFO ] Build ................................. 2024.1.0-14992-621b025bef4 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 12.00 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,224,224] - [ INFO ] Model outputs: - [ INFO ] x.45 (node: aten::linear/Add) : f32 / [...] 
/ [1,1000] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,224,224] - [ INFO ] Model outputs: - [ INFO ] x.45 (node: aten::linear/Add) : f32 / [...] / [1,1000] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 177.18 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model2 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 4 - [ INFO ] NUM_STREAMS: 4 - [ INFO ] AFFINITY: Affinity.HYBRID_AWARE - [ INFO ] INFERENCE_NUM_THREADS: 16 - [ INFO ] PERF_COUNT: NO - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] ENABLE_CPU_PINNING: False - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 0 - [ INFO ] KV_CACHE_PRECISION: - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 4 inference requests, limits: 60000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 31.62 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 3212 iterations - [ INFO ] Duration: 60082.26 ms - [ INFO ] Latency: - [ INFO ] Median: 65.28 ms - [ INFO ] Average: 74.60 ms - [ INFO ] Min: 35.65 ms - [ INFO ] Max: 157.31 ms - [ INFO ] Throughput: 53.46 FPS - - -.. code:: ipython3 - - !benchmark_app -m {model_path} -d NPU -hint throughput - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.1.0-14992-621b025bef4 - [ INFO ] - [ INFO ] Device info: - [ INFO ] NPU - [ INFO ] Build ................................. 2024.1.0-14992-621b025bef4 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 11.50 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,224,224] - [ INFO ] Model outputs: - [ INFO ] x.45 (node: aten::linear/Add) : f32 / [...] / [1,1000] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,224,224] - [ INFO ] Model outputs: - [ INFO ] x.45 (node: aten::linear/Add) : f32 / [...] 
/ [1,1000] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 2265.07 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] DEVICE_ID: - [ INFO ] ENABLE_CPU_PINNING: False - [ INFO ] EXECUTION_DEVICES: NPU.3720 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] INTERNAL_SUPPORTED_PROPERTIES: {'CACHING_PROPERTIES': 'RO'} - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] NETWORK_NAME: - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 4 - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 1 - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 4 inference requests, limits: 60000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 7.95 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:NPU.3720 - [ INFO ] Count: 19080 iterations - [ INFO ] Duration: 60024.79 ms - [ INFO ] Latency: - [ INFO ] Median: 12.51 ms - [ INFO ] Average: 12.56 ms - [ INFO ] Min: 6.92 ms - [ INFO ] Max: 25.80 ms - [ INFO ] Throughput: 317.87 FPS - - -Limitations ------------ - - - -1. Currently, only the models with static shapes are supported on NPU. -2. If the path to the model file includes non-Unicode symbols, such as - in Chinese, the model cannot be used for inference on NPU. It will - return an error. - -Conclusion ----------- - - - -This tutorial demonstrates how easy it is to use NPU in OpenVINO, check -its properties, and even tailor the model performance through the -different performance hints. - -Discover the power of Neural Processing Unit (NPU) with OpenVINO through -these interactive Jupyter notebooks: - -- `hello-world `__: - Start your OpenVINO journey by performing inference on an OpenVINO IR - model. -- `hello-segmentation `__: - Dive into inference with a segmentation model and explore image - segmentation capabilities. - -Model Optimization and Conversion -''''''''''''''''''''''''''''''''' - -- `tflite-to-openvino `__: - Learn the process of converting TensorFlow Lite models to OpenVINO IR - format. -- `yolov7-optimization `__: - Optimize the YOLOv7 model for enhanced performance in OpenVINO. -- `yolov8-optimization `__: - Convert and optimize YOLOv8 models for efficient deployment with - OpenVINO. - -Advanced Computer Vision Techniques -''''''''''''''''''''''''''''''''''' - -- `vision-background-removal `__: - Implement advanced image segmentation and background manipulation - with U^2-Net. -- `handwritten-ocr `__: - Apply optical character recognition to handwritten Chinese and - Japanese text. -- `vehicle-detection-and-recognition `__: - Use pre-trained models for vehicle detection and recognition in - images. -- `vision-image-colorization `__: - Bring black and white images to life by adding color with neural - networks. - -Real-Time Webcam Applications -''''''''''''''''''''''''''''' - -- `tflite-selfie-segmentation `__: - Apply TensorFlow Lite models for selfie segmentation and background - processing. -- `object-detection-webcam `__: - Experience real-time object detection using your webcam and OpenVINO. -- `pose-estimation-webcam `__: - Perform human pose estimation in real-time with webcam integration. 
-- `action-recognition-webcam `__: - Recognize and classify human actions live with your webcam. -- `style-transfer-webcam `__: - Transform your webcam feed with artistic styles in real-time using - pre-trained models. -- `3D-pose-estimation-webcam `__: - Perform 3D multi-person pose estimation with OpenVINO. diff --git a/docs/notebooks/hello-segmentation-with-output.rst b/docs/notebooks/hello-segmentation-with-output.rst deleted file mode 100644 index ab46b28a838550..00000000000000 --- a/docs/notebooks/hello-segmentation-with-output.rst +++ /dev/null @@ -1,283 +0,0 @@ -Hello Image Segmentation -======================== - -A very basic introduction to using segmentation models with OpenVINO™. - -In this tutorial, a pre-trained -`road-segmentation-adas-0001 `__ -model from the `Open Model -Zoo `__ is used. -ADAS stands for Advanced Driver Assistance Services. The model -recognizes four classes: background, road, curb and mark. - - -**Table of contents:** - - -- `Imports <#imports>`__ -- `Download model weights <#download-model-weights>`__ -- `Select inference device <#select-inference-device>`__ -- `Load the Model <#load-the-model>`__ -- `Load an Image <#load-an-image>`__ -- `Do Inference <#do-inference>`__ -- `Prepare Data for Visualization <#prepare-data-for-visualization>`__ -- `Visualize data <#visualize-data>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - # Install required packages - %pip install -q "openvino>=2023.1.0" opencv-python tqdm "matplotlib>=3.4" - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -Imports -------- - - - -.. code:: ipython3 - - import cv2 - import matplotlib.pyplot as plt - import numpy as np - import openvino as ov - from pathlib import Path - - # Fetch `notebook_utils` module - import requests - - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import segmentation_map_to_image, download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("hello-segmentation.ipynb") - -Download model weights ----------------------- - - - -.. code:: ipython3 - - from pathlib import Path - - base_model_dir = Path("./model").expanduser() - - model_name = "road-segmentation-adas-0001" - model_xml_name = f"{model_name}.xml" - model_bin_name = f"{model_name}.bin" - - model_xml_path = base_model_dir / model_xml_name - - if not model_xml_path.exists(): - model_xml_url = ( - "https://storage.openvinotoolkit.org/repositories/open_model_zoo/2023.0/models_bin/1/road-segmentation-adas-0001/FP32/road-segmentation-adas-0001.xml" - ) - model_bin_url = ( - "https://storage.openvinotoolkit.org/repositories/open_model_zoo/2023.0/models_bin/1/road-segmentation-adas-0001/FP32/road-segmentation-adas-0001.bin" - ) - - download_file(model_xml_url, model_xml_name, base_model_dir) - download_file(model_bin_url, model_bin_name, base_model_dir) - else: - print(f"{model_name} already downloaded to {base_model_dir}") - - - -.. 
parsed-literal:: - - road-segmentation-adas-0001.xml: 0%| | 0.00/389k [00:00`__ dataset -is provided. - -.. code:: ipython3 - - # Download the image from the openvino_notebooks storage - image_filename = Path("data/empty_road_mapillary.jpg") - - - if not image_filename.exists(): - download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/empty_road_mapillary.jpg", - directory="data", - ) - - # The segmentation network expects images in BGR format. - image = cv2.imread(str(image_filename)) - - rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - image_h, image_w, _ = image.shape - - # N,C,H,W = batch size, number of channels, height, width. - N, C, H, W = input_layer_ir.shape - - # OpenCV resize expects the destination size as (width, height). - resized_image = cv2.resize(image, (W, H)) - - # Reshape to the network input shape. - input_image = np.expand_dims(resized_image.transpose(2, 0, 1), 0) - plt.imshow(rgb_image) - - - -.. parsed-literal:: - - empty_road_mapillary.jpg: 0%| | 0.00/227k [00:00 - - - - -.. image:: hello-segmentation-with-output_files/hello-segmentation-with-output_11_2.png - - -Do Inference ------------- - - - -.. code:: ipython3 - - # Run the inference. - result = compiled_model([input_image])[output_layer_ir] - - # Prepare data for visualization. - segmentation_mask = np.argmax(result, axis=1) - plt.imshow(segmentation_mask.transpose(1, 2, 0)) - - - - -.. parsed-literal:: - - - - - - -.. image:: hello-segmentation-with-output_files/hello-segmentation-with-output_13_1.png - - -Prepare Data for Visualization ------------------------------- - - - -.. code:: ipython3 - - # Define colormap, each color represents a class. - colormap = np.array([[68, 1, 84], [48, 103, 141], [53, 183, 120], [199, 216, 52]]) - - # Define the transparency of the segmentation mask on the photo. - alpha = 0.3 - - # Use function from notebook_utils.py to transform mask to an RGB image. - mask = segmentation_map_to_image(segmentation_mask, colormap) - resized_mask = cv2.resize(mask, (image_w, image_h)) - - # Create an image with mask. - image_with_mask = cv2.addWeighted(resized_mask, alpha, rgb_image, 1 - alpha, 0) - -Visualize data --------------- - - - -.. code:: ipython3 - - # Define titles with images. - data = {"Base Photo": rgb_image, "Segmentation": mask, "Masked Photo": image_with_mask} - - # Create a subplot to visualize images. - fig, axs = plt.subplots(1, len(data.items()), figsize=(15, 10)) - - # Fill the subplot. - for ax, (name, image) in zip(axs, data.items()): - ax.axis("off") - ax.set_title(name) - ax.imshow(image) - - # Display an image. - plt.show(fig) - - - -.. 
image:: hello-segmentation-with-output_files/hello-segmentation-with-output_17_0.png - diff --git a/docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_11_2.png b/docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_11_2.png deleted file mode 100644 index 5023362b06be2d..00000000000000 --- a/docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_11_2.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:96f0eb3a9535d57b8784be4b717dc9f280e4bf107e5b61d7cf51b36e142e4c7a -size 249032 diff --git a/docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_13_1.png b/docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_13_1.png deleted file mode 100644 index fe6d042ef77d30..00000000000000 --- a/docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_13_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:caef59a6c15a5a1d512f4dd22395b12fbd754bba264ea5f0deae323ff8edee39 -size 20550 diff --git a/docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_17_0.png b/docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_17_0.png deleted file mode 100644 index 310b0d3545d48c..00000000000000 --- a/docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_17_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6a3137d9359a44fb19e1900e6b808f9e7e7ded0ba209abe8c4bd90fcf37b1c6a -size 260045 diff --git a/docs/notebooks/hello-world-with-output.rst b/docs/notebooks/hello-world-with-output.rst deleted file mode 100644 index ee6ae64726d19c..00000000000000 --- a/docs/notebooks/hello-world-with-output.rst +++ /dev/null @@ -1,233 +0,0 @@ -Hello Image Classification -========================== - -This basic introduction to OpenVINO™ shows how to do inference with an -image classification model. - -A pre-trained `MobileNetV3 -model `__ -from `Open Model -Zoo `__ is used in -this tutorial. For more information about how OpenVINO IR models are -created, refer to the `TensorFlow to -OpenVINO `__ -tutorial. - - -**Table of contents:** - - -- `Imports <#imports>`__ -- `Download the Model and data - samples <#download-the-model-and-data-samples>`__ -- `Select inference device <#select-inference-device>`__ -- `Load the Model <#load-the-model>`__ -- `Load an Image <#load-an-image>`__ -- `Do Inference <#do-inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - # Install required packages - %pip install -q "openvino>=2023.1.0" opencv-python tqdm "matplotlib>=3.4" - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -Imports -------- - - - -.. 
code:: ipython3 - - from pathlib import Path - - import cv2 - import matplotlib.pyplot as plt - import numpy as np - import openvino as ov - - # Fetch `notebook_utils` module - import requests - - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("hello-world.ipynb") - -Download the Model and data samples ------------------------------------ - - - -.. code:: ipython3 - - base_artifacts_dir = Path("./artifacts").expanduser() - - model_name = "v3-small_224_1.0_float" - model_xml_name = f"{model_name}.xml" - model_bin_name = f"{model_name}.bin" - - model_xml_path = base_artifacts_dir / model_xml_name - - base_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/mobelinet-v3-tf/FP32/" - - if not model_xml_path.exists(): - download_file(base_url + model_xml_name, model_xml_name, base_artifacts_dir) - download_file(base_url + model_bin_name, model_bin_name, base_artifacts_dir) - else: - print(f"{model_name} already downloaded to {base_artifacts_dir}") - - - -.. parsed-literal:: - - v3-small_224_1.0_float.xml: 0%| | 0.00/294k [00:00`__ is a -central repository for pre-trained deep learning models. It allows -exploration and provides access to thousands of models for a wide range -of tasks, including text classification, question answering, and image -classification. Hugging Face provides Python packages that serve as APIs -and tools to easily download and fine tune state-of-the-art pretrained -models, namely -`transformers `__ and -`diffusers `__ packages. - -.. image:: https://github.com/huggingface/optimum-intel/raw/main/readme_logo.png - -Throughout this notebook we will learn: 1. How to load a HF pipeline -using the ``transformers`` package and then convert it to OpenVINO. 2. -How to load the same pipeline using Optimum Intel package. - - -**Table of contents:** - - -- `Converting a Model from the HF Transformers - Package <#converting-a-model-from-the-hf-transformers-package>`__ - - - `Installing Requirements <#installing-requirements>`__ - - `Imports <#imports>`__ - - `Initializing a Model Using the HF Transformers - Package <#initializing-a-model-using-the-hf-transformers-package>`__ - - `Original Model inference <#original-model-inference>`__ - - `Converting the Model to OpenVINO IR - format <#converting-the-model-to-openvino-ir-format>`__ - - `Converted Model Inference <#converted-model-inference>`__ - -- `Converting a Model Using the Optimum Intel - Package <#converting-a-model-using-the-optimum-intel-package>`__ - - - `Install Requirements for - Optimum <#install-requirements-for-optimum>`__ - - `Import Optimum <#import-optimum>`__ - - `Initialize and Convert the Model Automatically using OVModel - class <#initialize-and-convert-the-model-automatically-using-ovmodel-class>`__ - - `Convert model using Optimum CLI - interface <#convert-model-using-optimum-cli-interface>`__ - - `The Optimum Model Inference <#the-optimum-model-inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. 
You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Converting a Model from the HF Transformers Package ---------------------------------------------------- - - - -Hugging Face transformers package provides API for initializing a model -and loading a set of pre-trained weights using the model text handle. -Discovering a desired model name is straightforward with `HF website’s -Models page `__, one can choose a model -solving a particular machine learning problem and even sort the models -by popularity and novelty. - -Installing Requirements -~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "transformers>=4.33.0" "torch>=2.1.0" - %pip install -q ipywidgets - %pip install -q "openvino>=2023.1.0" "Pillow" - -Imports -~~~~~~~ - - - -.. code:: ipython3 - - from pathlib import Path - - import numpy as np - import torch - - from transformers import AutoModelForSequenceClassification - from transformers import AutoTokenizer - - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("hugging-face-hub.ipynb") - -Initializing a Model Using the HF Transformers Package -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We will use `roberta text sentiment -classification `__ -model in our example, it is a transformer-based encoder model pretrained -in a special way, please refer to the model card to learn more. - -Following the instructions on the model page, we use -``AutoModelForSequenceClassification`` to initialize the model and -perform inference with it. To find more information on HF pipelines and -model initialization please refer to `HF -tutorials `__. - -.. code:: ipython3 - - MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest" - - tokenizer = AutoTokenizer.from_pretrained(MODEL, return_dict=True) - - # The torchscript=True flag is used to ensure the model outputs are tuples - # instead of ModelOutput (which causes JIT errors). - model = AutoModelForSequenceClassification.from_pretrained(MODEL, torchscript=True) - -Original Model inference -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Let’s do a classification of a simple prompt below. - -.. code:: ipython3 - - text = "HF models run perfectly with OpenVINO!" - - encoded_input = tokenizer(text, return_tensors="pt") - output = model(**encoded_input) - scores = output[0][0] - scores = torch.softmax(scores, dim=0).numpy(force=True) - - - def print_prediction(scores): - for i, descending_index in enumerate(scores.argsort()[::-1]): - label = model.config.id2label[descending_index] - score = np.round(float(scores[descending_index]), 4) - print(f"{i+1}) {label} {score}") - - - print_prediction(scores) - - -.. parsed-literal:: - - 1) positive 0.9485 - 2) neutral 0.0484 - 3) negative 0.0031 - - -Converting the Model to OpenVINO IR format -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -We use the OpenVINO `Model -conversion -API `__ -to convert the model (this one is implemented in PyTorch) to OpenVINO -Intermediate Representation (IR). - -Note how we reuse our real ``encoded_input``, passing it to the -``ov.convert_model`` function. 
It will be used for model tracing. - -.. code:: ipython3 - - import openvino as ov - - save_model_path = Path("./models/model.xml") - - if not save_model_path.exists(): - ov_model = ov.convert_model(model, example_input=dict(encoded_input)) - ov.save_model(ov_model, save_model_path) - -Converted Model Inference -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -First, we pick a device to do the model inference - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO') - - - -OpenVINO model IR must be compiled for a specific device prior to the -model inference. - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - - compiled_model = core.compile_model(save_model_path, device.value) - - # Compiled model call is performed using the same parameters as for the original model - scores_ov = compiled_model(encoded_input.data)[0] - - scores_ov = torch.softmax(torch.tensor(scores_ov[0]), dim=0).detach().numpy() - - print_prediction(scores_ov) - - -.. parsed-literal:: - - 1) positive 0.9483 - 2) neutral 0.0485 - 3) negative 0.0031 - - -Note the prediction of the converted model match exactly the one of the -original model. - -This is a rather simple example as the pipeline includes just one -encoder model. Contemporary state of the art pipelines often consist of -several model, feel free to explore other OpenVINO tutorials: 1. `Stable -Diffusion v2 `__ 2. `Zero-shot Image -Classification with OpenAI -CLIP `__ 3. `Controllable Music -Generation with MusicGen `__ - -The workflow for the ``diffusers`` package is exactly the same. The -first example in the list above relies on the ``diffusers``. - -Converting a Model Using the Optimum Intel Package --------------------------------------------------- - - - -Optimum Intel is the interface between the Transformers and -Diffusers libraries and the different tools and libraries provided by -Intel to accelerate end-to-end pipelines on Intel architectures. - -Among other use cases, Optimum Intel provides a simple interface to -optimize your Transformers and Diffusers models, convert them to the -OpenVINO Intermediate Representation (IR) format and run inference using -OpenVINO Runtime. - -Install Requirements for Optimum -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" - -Import Optimum -~~~~~~~~~~~~~~ - - - -Documentation for Optimum Intel states: >You can now easily perform -inference with OpenVINO Runtime on a variety of Intel processors (see -the full list of supported devices). For that, just replace the -``AutoModelForXxx`` class with the corresponding ``OVModelForXxx`` -class. - -You can find more information in `Optimum Intel -documentation `__. - -.. code:: ipython3 - - from optimum.intel.openvino import OVModelForSequenceClassification - -Initialize and Convert the Model Automatically using OVModel class -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To load a Transformers model and convert it to the OpenVINO format on -the fly, you can set ``export=True`` when loading your model. The model -can be saved in OpenVINO format using ``save_pretrained`` method and -specifying a directory for storing the model as an argument. 
For the -next usage, you can avoid the conversion step and load the saved early -model from disk using ``from_pretrained`` method without export -specification. We also specified ``device`` parameter for compiling the -model on the specific device, if not provided, the default device will -be used. The device can be changed later in runtime using -``model.to(device)``, please note that it may require some time for -model compilation on a newly selected device. In some cases, it can be -useful to separate model initialization and compilation, for example, if -you want to reshape the model using ``reshape`` method, you can postpone -compilation, providing the parameter ``compile=False`` into -``from_pretrained`` method, compilation can be performed manually using -``compile`` method or will be performed automatically during first -inference run. - -.. code:: ipython3 - - model = OVModelForSequenceClassification.from_pretrained(MODEL, export=True, device=device.value) - - # The save_pretrained() method saves the model weights to avoid conversion on the next load. - model.save_pretrained("./models/optimum_model") - -Convert model using Optimum CLI interface -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Alternatively, you can use the Optimum CLI interface for converting -models (supported starting optimum-intel 1.12 version). General command -format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where task is task to export the model for, if not specified, the task -will be auto-inferred based on the model. Available tasks depend on the -model, but are among: [‘default’, ‘fill-mask’, ‘text-generation’, -‘text2text-generation’, ‘text-classification’, ‘token-classification’, -‘multiple-choice’, ‘object-detection’, ‘question-answering’, -‘image-classification’, ‘image-segmentation’, ‘masked-im’, -‘semantic-segmentation’, ‘automatic-speech-recognition’, -‘audio-classification’, ‘audio-frame-classification’, -‘automatic-speech-recognition’, ‘audio-xvector’, ‘image-to-text’, -‘stable-diffusion’, ‘zero-shot-object-detection’]. For decoder models, -use ``xxx-with-past`` to export the model using past key values in the -decoder. - -You can find a mapping between tasks and model classes in Optimum -TaskManager -`documentation `__. - -Additionally, you can specify weights compression using -``--weight-format`` argument with one of following options: ``fp32``, -``fp16``, ``int8`` and ``int4``. For int8 and int4 nncf will be used for -weight compression. - -Full list of supported arguments available via ``--help`` - -.. code:: ipython3 - - !optimum-cli export openvino --help - - -.. parsed-literal:: - - huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... - To disable this warning, you can either: - - Avoid using `tokenizers` before the fork if possible - - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) - - -.. 
parsed-literal:: - - 2024-07-17 09:40:40.173915: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] - [--framework {pt,tf}] [--trust-remote-code] - [--weight-format {fp32,fp16,int8,int4,int4_sym_g128,int4_asym_g128,int4_sym_g64,int4_asym_g64}] - [--library {transformers,diffusers,timm,sentence_transformers}] - [--cache_dir CACHE_DIR] - [--pad-token-id PAD_TOKEN_ID] - [--ratio RATIO] [--sym] - [--group-size GROUP_SIZE] - [--dataset DATASET] [--all-layers] [--awq] - [--scale-estimation] - [--sensitivity-metric SENSITIVITY_METRIC] - [--num-samples NUM_SAMPLES] - [--disable-stateful] - [--disable-convert-tokenizer] - output - - optional arguments: - -h, --help show this help message and exit - - Required arguments: - -m MODEL, --model MODEL - Model ID on huggingface.co or path on disk to load - model from. - output Path indicating the directory where to store the - generated OV model. - - Optional arguments: - --task TASK The task to export the model for. If not specified, - the task will be auto-inferred based on the model. - Available tasks depend on the model, but are among: - ['image-to-text', 'audio-frame-classification', 'text- - generation', 'fill-mask', 'image-segmentation', - 'audio-xvector', 'semantic-segmentation', 'depth- - estimation', 'token-classification', 'zero-shot-image- - classification', 'zero-shot-object-detection', - 'text2text-generation', 'sentence-similarity', - 'feature-extraction', 'conversational', 'image- - classification', 'text-to-audio', 'stable-diffusion', - 'image-to-image', 'text-classification', 'automatic- - speech-recognition', 'multiple-choice', 'masked-im', - 'mask-generation', 'question-answering', 'object- - detection', 'audio-classification', 'stable-diffusion- - xl']. For decoder models, use `xxx-with-past` to - export the model using past key values in the decoder. - --framework {pt,tf} The framework to use for the export. If not provided, - will attempt to use the local checkpoint's original - framework or what is available in the environment. - --trust-remote-code Allows to use custom code for the modeling hosted in - the model repository. This option should only be set - for repositories you trust and in which you have read - the code, as it will execute on your local machine - arbitrary code present in the model repository. - --weight-format {fp32,fp16,int8,int4,int4_sym_g128,int4_asym_g128,int4_sym_g64,int4_asym_g64} - he weight format of the exported model. - --library {transformers,diffusers,timm,sentence_transformers} - The library used to load the model before export. If - not provided, will attempt to infer the local - checkpoint's library - --cache_dir CACHE_DIR - The path to a directory in which the downloaded model - should be cached if the standard cache should not be - used. - --pad-token-id PAD_TOKEN_ID - This is needed by some models, for some tasks. If not - provided, will attempt to use the tokenizer to guess - it. - --ratio RATIO A parameter used when applying 4-bit quantization to - control the ratio between 4-bit and 8-bit - quantization. If set to 0.8, 80% of the layers will be - quantized to int4 while 20% will be quantized to int8. - This helps to achieve better accuracy at the sacrifice - of the model size and inference latency. Default value - is 1.0. - --sym Whether to apply symmetric quantization - --group-size GROUP_SIZE - The group size to use for quantization. 
Recommended - value is 128 and -1 uses per-column quantization. - --dataset DATASET The dataset used for data-aware compression or - quantization with NNCF. You can use the one from the - list ['wikitext2','c4','c4-new'] for language models - or ['conceptual_captions','laion/220k-GPT4Vision- - captions-from-LIVIS','laion/filtered-wit'] for - diffusion models. - --all-layers Whether embeddings and last MatMul layers should be - compressed to INT4. If not provided an weight - compression is applied, they are compressed to INT8. - --awq Whether to apply AWQ algorithm. AWQ improves - generation quality of INT4-compressed LLMs, but - requires additional time for tuning weights on a - calibration dataset. To run AWQ, please also provide a - dataset argument. Note: it's possible that there will - be no matching patterns in the model to apply AWQ, in - such case it will be skipped. - --scale-estimation Indicates whether to apply a scale estimation - algorithm that minimizes the L2 error between the - original and compressed layers. Providing a dataset is - required to run scale estimation. Please note, that - applying scale estimation takes additional memory and - time. - --sensitivity-metric SENSITIVITY_METRIC - The sensitivity metric for assigning quantization - precision to layers. Can be one of the following: - ['weight_quantization_error', - 'hessian_input_activation', - 'mean_activation_variance', 'max_activation_variance', - 'mean_activation_magnitude']. - --num-samples NUM_SAMPLES - The maximum number of samples to take from the dataset - for quantization. - --disable-stateful Disable stateful converted models, stateless models - will be generated instead. Stateful models are - produced by default when this key is not used. In - stateful models all kv-cache inputs and outputs are - hidden in the model and are not exposed as model - inputs and outputs. If --disable-stateful option is - used, it may result in sub-optimal inference - performance. Use it when you intentionally want to use - a stateless model, for example, to be compatible with - existing OpenVINO native inference code that expects - kv-cache inputs and outputs in the model. - --disable-convert-tokenizer - Do not add converted tokenizer and detokenizer - OpenVINO models. - - -The command line export for model from example above with FP16 weights -compression: - -.. code:: ipython3 - - !optimum-cli export openvino --model $MODEL --task text-classification --weight-format fp16 models/optimum_model/fp16 - -After export, model will be available in the specified directory and can -be loaded using the same OVModelForXXX class. - -.. code:: ipython3 - - model = OVModelForSequenceClassification.from_pretrained("models/optimum_model/fp16", device=device.value) - - -.. parsed-literal:: - - Compiling the model to AUTO ... - - -There are some models in the Hugging Face Models Hub, that are already -converted and ready to run! You can filter those models out by library -name, just type OpenVINO, or follow `this -link `__. - -The Optimum Model Inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Model inference is exactly the same as for the original model! - -.. code:: ipython3 - - output = model(**encoded_input) - scores = output[0][0] - scores = torch.softmax(scores, dim=0).numpy(force=True) - - print_prediction(scores) - - -.. parsed-literal:: - - 1) positive 0.9483 - 2) neutral 0.0485 - 3) negative 0.0031 - - -You can find more examples of using Optimum Intel here: 1. `Accelerate -Inference of Sparse Transformer -Models `__ 2. 
-`Grammatical Error Correction with -OpenVINO `__ 3. `Stable -Diffusion v2.1 using Optimum-Intel -OpenVINO `__ -4. `Image generation with Stable Diffusion -XL `__ 5. `Create LLM-powered Chatbot using -OpenVINO `__ 6. `Document Visual Question Answering -Using Pix2Struct and OpenVINO `__ 7. `Automatic -speech recognition using Distil-Whisper and -OpenVINO `__ diff --git a/docs/notebooks/hunyuan-dit-image-generation-with-output.rst b/docs/notebooks/hunyuan-dit-image-generation-with-output.rst deleted file mode 100644 index 72f8f946277fab..00000000000000 --- a/docs/notebooks/hunyuan-dit-image-generation-with-output.rst +++ /dev/null @@ -1,945 +0,0 @@ -Image generation with HunyuanDIT and OpenVINO -============================================= - -Hunyuan-DiT is a powerful text-to-image diffusion transformer with -fine-grained understanding of both English and Chinese. - -|image0| - -The model architecture expertly blends diffusion models and transformer -networks to unlock the potential of text-to-image generation. The -diffusion transformer consists of encoder and decoder blocks that work -together to translate the text prompt into a visual representation. Each -block contains three key modules: self-attention, cross-attention, and a -feed-forward network. Self-attention analyzes relationships within the -image, while cross-attention fuses the text encoding from CLIP and T5, -guiding the image generation process based on the user’s input. The -Hunyuan-DiT block, specifically, consists of these encoder and decoder -blocks. The encoder block processes the image patches, capturing -patterns and dependencies, while the decoder reconstructs the image from -the encoded information. The decoder also includes a skip module that -directly connects to the encoder, facilitating information flow and -enhancing detail reconstruction. Rotary Positional Embedding (RoPE) -ensures that the model understands the spatial relationships between -image patches, accurately reconstructing the visual composition. -Additionally, Centralized Interpolative Positional Encoding enables -multi-resolution training, allowing Hunyuan-DiT to handle various image -sizes seamlessly. - -More details about model can be found in original -`repository `__, `project web -page `__ and -`paper `__. - -In this tutorial we consider how to convert and run Hunyuan-DIT model -using OpenVINO. Additionally, we will use -`NNCF `__ for optimizing model -in low precision. - -**Table of contents:** - -- `Prerequisites <#prerequisites>`__ -- `Download PyTorch model <#download-pytorch-model>`__ -- `Build PyTorch pipeline <#build-pytorch-pipeline>`__ -- `Convert and Optimize models with OpenVINO and - NNCF <#convert-and-optimize-models-with-openvino-and-nncf>`__ - - - `DiT <#dit>`__ - - `Text Encoder <#text-encoder>`__ - - `Text Embedder <#text-embedder>`__ - - `VAE Decoder <#vae-decoder>`__ - -- `Create Inference pipeline <#create-inference-pipeline>`__ - - - `Run model <#run-model>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. |image0| image:: https://raw.githubusercontent.com/Tencent/HunyuanDiT/main/asset/framework.png - -Prerequisites -------------- - - - -.. 
code:: ipython3 - - %pip install -q "torch>=2.1" torchvision einops timm peft accelerate transformers diffusers huggingface-hub tokenizers sentencepiece protobuf loguru --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "nncf>=2.12" "gradio>=4.19" "pillow" "opencv-python" - %pip install -Uq "openvino>=2024.3.0" - -.. code:: ipython3 - - from pathlib import Path - import requests - - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("hunyuan-dit-image-generation.ipynb") - -.. code:: ipython3 - - from cmd_helper import clone_repo - - - repo_dir = clone_repo("https://github.com/tencent/HunyuanDiT", "ebfb7936490287616c38519f87084a34a1d75362") - -Download PyTorch model ----------------------- - - - -For starting work with model, we should download it from HuggingFace -Hub. We will use -`Distilled `__ -version of -`hunyuan-DIT `__. In -the first time, model downloading may take some time. - -.. code:: ipython3 - - import huggingface_hub as hf_hub - - weights_dir = Path("ckpts") - weights_dir.mkdir(exist_ok=True) - models_dir = Path("models") - models_dir.mkdir(exist_ok=True) - - OV_DIT_MODEL = models_dir / "dit.xml" - OV_TEXT_ENCODER = models_dir / "text_encoder.xml" - OV_TEXT_EMBEDDER = models_dir / "text_embedder.xml" - OV_VAE_DECODER = models_dir / "vae_decoder.xml" - - model_conversion_required = not all([OV_DIT_MODEL.exists(), OV_TEXT_ENCODER.exists(), OV_TEXT_EMBEDDER.exists(), OV_VAE_DECODER.exists()]) - distilled_repo_id = "Tencent-Hunyuan/Distillation" - orig_repo_id = "Tencent-Hunyuan/HunyuanDiT" - - if model_conversion_required and not (weights_dir / "t2i").exists(): - hf_hub.snapshot_download(repo_id=orig_repo_id, local_dir=weights_dir, allow_patterns=["t2i/*"], ignore_patterns=["t2i/model/*"]) - hf_hub.hf_hub_download(repo_id=distilled_repo_id, filename="pytorch_model_distill.pt", local_dir=weights_dir / "t2i/model") - -Build PyTorch pipeline ----------------------- - - - -The code bellow, initialize PyTorch inference pipeline for hunyuan-DIT -model. - -.. code:: ipython3 - - from hydit.inference import End2End - from hydit.config import get_args - - gen = None - - if model_conversion_required: - args = get_args({}) - args.load_key = "distill" - args.model_root = weights_dir - - # Load models - gen = End2End(args, weights_dir) - - -.. parsed-literal:: - - /home/ea/work/notebooks_env/lib/python3.8/site-packages/diffusers/models/transformers/transformer_2d.py:34: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 1.0.0. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.modeling_outputs import Transformer2DModelOutput`, instead. - deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message) - - -.. 
parsed-literal:: - - flash_attn import failed: No module named 'flash_attn' - - -Convert and Optimize models with OpenVINO and NNCF --------------------------------------------------- - - - -Starting from 2023.0 release, OpenVINO supports PyTorch models directly -via Model Conversion API. ``ov.convert_model`` function accepts instance -of PyTorch model and example inputs for tracing and returns object of -``ov.Model`` class, ready to use or save on disk using ``ov.save_model`` -function. - -The pipeline consists of four important parts: - -- Clip and T5 Text Encoder to create condition to generate an image - from a text prompt. -- DIT for step-by-step denoising latent image representation. -- Autoencoder (VAE) for decoding latent space to image. - -For reducing model memory consumption and improving performance we will -use weights compression. The `Weights -Compression `__ -algorithm is aimed at compressing the weights of the models and can be -used to optimize the model footprint and performance of large models -where the size of weights is relatively larger than the size of -activations, for example, Large Language Models (LLM). Compared to INT8 -compression, INT4 compression improves performance even more, but -introduces a minor drop in prediction quality. - -Let us convert and optimize each part: - -DiT -~~~ - - - -.. code:: ipython3 - - import torch - import nncf - import gc - import openvino as ov - - - def cleanup_torchscript_cache(): - """ - Helper for removing cached model representation - """ - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - - - if not OV_DIT_MODEL.exists(): - latent_model_input = torch.randn(2, 4, 64, 64) - t_expand = torch.randint(0, 1000, [2]) - prompt_embeds = torch.randn(2, 77, 1024) - attention_mask = torch.randint(0, 2, [2, 77]) - prompt_embeds_t5 = torch.randn(2, 256, 2048) - attention_mask_t5 = torch.randint(0, 2, [2, 256]) - ims = torch.tensor([[512, 512, 512, 512, 0, 0], [512, 512, 512, 512, 0, 0]]) - style = torch.tensor([0, 0]) - freqs_cis_img = ( - torch.randn(1024, 88), - torch.randn(1024, 88), - ) - model_args = ( - latent_model_input, - t_expand, - prompt_embeds, - attention_mask, - prompt_embeds_t5, - attention_mask_t5, - ims, - style, - freqs_cis_img[0], - freqs_cis_img[1], - ) - - gen.model.to(torch.device("cpu")) - gen.model.to(torch.float32) - gen.model.args.use_fp16 = False - ov_model = ov.convert_model(gen.model, example_input=model_args) - ov_model = nncf.compress_weights(ov_model, mode=nncf.CompressWeightsMode.INT4_SYM, ratio=0.8, group_size=64) - ov.save_model(ov_model, OV_DIT_MODEL) - del ov_model - cleanup_torchscript_cache() - del gen.model - gc.collect() - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino - - -Text Encoder -~~~~~~~~~~~~ - - - -.. code:: ipython3 - - if not OV_TEXT_ENCODER.exists(): - gen.clip_text_encoder.to("cpu") - gen.clip_text_encoder.to(torch.float32) - ov_model = ov.convert_model( - gen.clip_text_encoder, example_input={"input_ids": torch.ones([1, 77], dtype=torch.int64), "attention_mask": torch.ones([1, 77], dtype=torch.int64)} - ) - ov_model = nncf.compress_weights(ov_model, mode=nncf.CompressWeightsMode.INT4_SYM, ratio=0.8, group_size=64) - ov.save_model(ov_model, OV_TEXT_ENCODER) - del ov_model - cleanup_torchscript_cache() - del gen.clip_text_encoder - gc.collect() - -Text Embedder -~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - if not OV_TEXT_EMBEDDER.exists(): - gen.embedder_t5.model.to("cpu") - gen.embedder_t5.model.to(torch.float32) - - ov_model = ov.convert_model(gen.embedder_t5, example_input=(torch.ones([1, 256], dtype=torch.int64), torch.ones([1, 256], dtype=torch.int64))) - ov_model = nncf.compress_weights(ov_model, mode=nncf.CompressWeightsMode.INT4_SYM, ratio=0.8, group_size=64) - ov.save_model(ov_model, OV_TEXT_EMBEDDER) - del ov_model - cleanup_torchscript_cache() - del gen.embedder_t5 - gc.collect() - -VAE Decoder -~~~~~~~~~~~ - - - -.. code:: ipython3 - - if not OV_VAE_DECODER.exists(): - vae_decoder = gen.vae - vae_decoder.to("cpu") - vae_decoder.to(torch.float32) - - vae_decoder.forward = vae_decoder.decode - - ov_model = ov.convert_model(vae_decoder, example_input=torch.zeros((1, 4, 128, 128))) - ov.save_model(ov_model, OV_VAE_DECODER) - del ov_model - cleanup_torchscript_cache() - del vae_decoder - del gen.vae - gc.collect() - -.. code:: ipython3 - - del gen - gc.collect(); - -Create Inference pipeline -------------------------- - - - -.. code:: ipython3 - - import inspect - from typing import Any, Callable, Dict, List, Optional, Union - - import torch - from diffusers.configuration_utils import FrozenDict - from diffusers.image_processor import VaeImageProcessor - from diffusers.models import AutoencoderKL, UNet2DConditionModel - from diffusers.pipelines.pipeline_utils import DiffusionPipeline - from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput - from diffusers.schedulers import KarrasDiffusionSchedulers - from diffusers.utils.torch_utils import randn_tensor - from transformers import BertModel, BertTokenizer - from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - - - def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): - """ - Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
See Section 3.4 - """ - std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) - std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) - # rescale the results from guidance (fixes overexposure) - noise_pred_rescaled = noise_cfg * (std_text / std_cfg) - # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images - noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg - return noise_cfg - - - class OVHyDiTPipeline(DiffusionPipeline): - def __init__( - self, - vae: AutoencoderKL, - text_encoder: Union[BertModel, CLIPTextModel], - tokenizer: Union[BertTokenizer, CLIPTokenizer], - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - feature_extractor: CLIPImageProcessor, - progress_bar_config: Dict[str, Any] = None, - embedder_t5=None, - embedder_tokenizer=None, - ): - self.embedder_t5 = embedder_t5 - self.embedder_tokenizer = embedder_tokenizer - - if progress_bar_config is None: - progress_bar_config = {} - if not hasattr(self, "_progress_bar_config"): - self._progress_bar_config = {} - self._progress_bar_config.update(progress_bar_config) - - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: - new_config = dict(scheduler.config) - new_config["clip_sample"] = False - scheduler._internal_dict = FrozenDict(new_config) - - self.vae = vae - self.text_encoder = text_encoder - self.tokenizer = tokenizer - self.unet = unet - self.scheduler = scheduler - self.feature_extractor = feature_extractor - self.vae_scale_factor = 2**3 - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - - def encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - embedder=None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- embedder: - T5 embedder - """ - if embedder is None: - text_encoder = self.text_encoder - tokenizer = self.tokenizer - max_length = self.tokenizer.model_max_length - else: - text_encoder = embedder - tokenizer = self.embedder_tokenizer - max_length = 256 - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=max_length, - truncation=True, - return_attention_mask=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - attention_mask = text_inputs.attention_mask - - prompt_embeds = text_encoder([text_input_ids, attention_mask]) - prompt_embeds = torch.from_numpy(prompt_embeds[0]) - attention_mask = attention_mask.repeat(num_images_per_prompt, 1) - else: - attention_mask = None - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) - prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError(f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" f" {type(prompt)}.") - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - max_length = prompt_embeds.shape[1] - uncond_input = tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - uncond_attention_mask = uncond_input.attention_mask - negative_prompt_embeds = text_encoder([uncond_input.input_ids, uncond_attention_mask]) - negative_prompt_embeds = torch.from_numpy(negative_prompt_embeds[0]) - uncond_attention_mask = uncond_attention_mask.repeat(num_images_per_prompt, 1) - else: - uncond_attention_mask = None - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds - - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - - return prompt_embeds, negative_prompt_embeds, attention_mask, uncond_attention_mask - - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or (callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)): - raise ValueError(f"`callback_steps` has to be a positive integer but is {callback_steps} of type" f" {type(callback_steps)}.") - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError("Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.") - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, device=torch.device("cpu"), dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - def __call__( - self, - height: int, - width: int, - prompt: Union[str, List[str]] = None, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_embeds_t5: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds_t5: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor, torch.FloatTensor], None]] = None, - callback_steps: int = 1, - guidance_rescale: float = 0.0, - image_meta_size: Optional[torch.LongTensor] = None, - style: Optional[torch.LongTensor] = None, - freqs_cis_img: Optional[tuple] = None, - learn_sigma: bool = True, - ): - # 1. Check inputs. Raise error if not correct - self.check_inputs(prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance.from - do_classifier_free_guidance = guidance_scale > 1.0 - - prompt_embeds, negative_prompt_embeds, attention_mask, uncond_attention_mask = self.encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - prompt_embeds_t5, negative_prompt_embeds_t5, attention_mask_t5, uncond_attention_mask_t5 = self.encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds_t5, - negative_prompt_embeds=negative_prompt_embeds_t5, - embedder=self.embedder_t5, - ) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - if do_classifier_free_guidance: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - attention_mask = torch.cat([uncond_attention_mask, attention_mask]) - prompt_embeds_t5 = torch.cat([negative_prompt_embeds_t5, prompt_embeds_t5]) - attention_mask_t5 = torch.cat([uncond_attention_mask_t5, attention_mask_t5]) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=torch.device("cpu")) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = 4 - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. 
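        # Not every diffusers scheduler accepts `eta` or `generator`; prepare_extra_step_kwargs()
        # inspects the scheduler's step() signature and forwards only the arguments it supports.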
- extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - # expand scalar t to 1-D tensor to match the 1st dim of latent_model_input - t_expand = torch.tensor([t] * latent_model_input.shape[0], device=latent_model_input.device) - - ims = image_meta_size if image_meta_size is not None else torch.tensor([[1024, 1024, 1024, 1024, 0, 0], [1024, 1024, 1024, 1024, 0, 0]]) - - noise_pred = torch.from_numpy( - self.unet( - [ - latent_model_input, - t_expand, - prompt_embeds, - attention_mask, - prompt_embeds_t5, - attention_mask_t5, - ims, - style, - freqs_cis_img[0], - freqs_cis_img[1], - ] - )[0] - ) - if learn_sigma: - noise_pred, _ = noise_pred.chunk(2, dim=1) - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - if do_classifier_free_guidance and guidance_rescale > 0.0: - # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - results = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=True) - latents = results.prev_sample - pred_x0 = results.pred_original_sample if hasattr(results, "pred_original_sample") else None - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents, pred_x0) - - has_nsfw_concept = None - if not output_type == "latent": - image = torch.from_numpy(self.vae(latents / 0.13025)[0]) - else: - image = latents - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - -Run model -~~~~~~~~~ - - - -Please select inference device using dropdown widget: - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - import gc - import openvino as ov - - core = ov.Core() - ov_dit = core.read_model(OV_DIT_MODEL) - dit = core.compile_model(ov_dit, device.value) - ov_text_encoder = core.read_model(OV_TEXT_ENCODER) - text_encoder = core.compile_model(ov_text_encoder, device.value) - ov_text_embedder = core.read_model(OV_TEXT_EMBEDDER) - - text_embedder = core.compile_model(ov_text_embedder, device.value) - vae_decoder = core.compile_model(OV_VAE_DECODER, device.value) - - del ov_dit, ov_text_encoder, ov_text_embedder - - gc.collect(); - -.. 
code:: ipython3 - - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained("./ckpts/t2i/tokenizer/") - embedder_tokenizer = AutoTokenizer.from_pretrained("./ckpts/t2i/mt5") - -.. code:: ipython3 - - from hydit.constants import SAMPLER_FACTORY, NEGATIVE_PROMPT - -.. code:: ipython3 - - sampler = "ddpm" - kwargs = SAMPLER_FACTORY[sampler]["kwargs"] - scheduler = SAMPLER_FACTORY[sampler]["scheduler"] - -.. code:: ipython3 - - from diffusers import schedulers - - scheduler_class = getattr(schedulers, scheduler) - scheduler = scheduler_class(**kwargs) - -.. code:: ipython3 - - ov_pipe = OVHyDiTPipeline(vae_decoder, text_encoder, tokenizer, dit, scheduler, None, None, embedder_t5=text_embedder, embedder_tokenizer=embedder_tokenizer) - -.. code:: ipython3 - - from hydit.modules.posemb_layers import get_2d_rotary_pos_embed, get_fill_resize_and_crop - - - def calc_rope(height, width, patch_size=2, head_size=88): - th = height // 8 // patch_size - tw = width // 8 // patch_size - base_size = 512 // 8 // patch_size - start, stop = get_fill_resize_and_crop((th, tw), base_size) - sub_args = [start, stop, (th, tw)] - rope = get_2d_rotary_pos_embed(head_size, *sub_args) - return rope - -.. code:: ipython3 - - from hydit.utils.tools import set_seeds - - height, width = 880, 880 - style = torch.as_tensor([0, 0]) - target_height = int((height // 16) * 16) - target_width = int((width // 16) * 16) - - size_cond = [height, width, target_width, target_height, 0, 0] - image_meta_size = torch.as_tensor([size_cond] * 2) - freqs_cis_img_cache = {} - - if (target_height, target_width) not in freqs_cis_img_cache: - freqs_cis_img_cache[(target_height, target_width)] = calc_rope(target_height, target_width) - - freqs_cis_img = freqs_cis_img_cache[(target_height, target_width)] - images = ov_pipe( - prompt="cute cat", - negative_prompt=NEGATIVE_PROMPT, - height=target_height, - width=target_width, - num_inference_steps=10, - image_meta_size=image_meta_size, - style=style, - return_dict=False, - guidance_scale=7.5, - freqs_cis_img=freqs_cis_img, - generator=set_seeds(42), - ) - - - -.. parsed-literal:: - - 0%| | 0/10 [00:00`__. It uses the -MobileNet V2 model, trained on Cifar10 dataset. The code is designed to -be extendable to custom models and datasets. The tutorial uses OpenVINO -backend for performing model quantization in NNCF, if you interested how -to apply quantization on PyTorch model, please check this -`tutorial `__. - -This tutorial consists of the following steps: - -- Prepare the model for quantization. -- Define a data loading functionality. -- Perform quantization. -- Compare accuracy of the original and quantized models. -- Compare performance of the original and quantized models. -- Compare results on one picture. 
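The steps above can be summarized in a short sketch before the full
walkthrough. This is a minimal sketch only, assuming a PyTorch ``model``
and a ``torch.utils.data.DataLoader`` named ``val_loader`` as they are
defined later in this notebook; it shows just the calibration-set wiring
and the ``nncf.quantize`` call.

.. code:: ipython3

    # Minimal sketch of the post-training quantization flow, assuming `model` (a PyTorch
    # classifier) and `val_loader` (a torch DataLoader) are defined as in this notebook.
    import nncf
    import openvino as ov

    # Convert the PyTorch model to an OpenVINO IR with a static input shape.
    ov_model = ov.convert_model(model, input=[1, 3, 32, 32])

    # NNCF needs model inputs only, so drop the labels from each (image, label) pair.
    def transform_fn(data_item):
        image_tensor, _label = data_item
        return image_tensor.numpy()

    calibration_dataset = nncf.Dataset(val_loader, transform_fn)

    # 8-bit post-training quantization and serialization of the optimized IR.
    quantized_model = nncf.quantize(ov_model, calibration_dataset)
    ov.save_model(quantized_model, "mobilenet_v2_int8.xml")
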
- - -**Table of contents:** - - -- `Prepare the Model <#prepare-the-model>`__ -- `Prepare Dataset <#prepare-dataset>`__ -- `Perform Quantization <#perform-quantization>`__ - - - `Create Dataset for Validation <#create-dataset-for-validation>`__ - -- `Run nncf.quantize for Getting an Optimized - Model <#run-nncf-quantize-for-getting-an-optimized-model>`__ -- `Serialize an OpenVINO IR model <#serialize-an-openvino-ir-model>`__ -- `Compare Accuracy of the Original and Quantized - Models <#compare-accuracy-of-the-original-and-quantized-models>`__ - - - `Select inference device <#select-inference-device>`__ - -- `Compare Performance of the Original and Quantized - Models <#compare-performance-of-the-original-and-quantized-models>`__ -- `Compare results on four - pictures <#compare-results-on-four-pictures>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - # Install required packages - %pip install -q "openvino>=2023.1.0" "nncf>=2.6.0" torch torchvision tqdm "matplotlib>=3.4" --extra-index-url https://download.pytorch.org/whl/cpu - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -.. code:: ipython3 - - from pathlib import Path - - # Set the data and model directories - DATA_DIR = Path("data") - MODEL_DIR = Path("model") - - DATA_DIR.mkdir(exist_ok=True) - MODEL_DIR.mkdir(exist_ok=True) - -Prepare the Model ------------------ - - - -Model preparation stage has the following steps: - -- Download a PyTorch model -- Convert model to OpenVINO Intermediate Representation format (IR) - using model conversion Python API -- Serialize converted model on disk - -.. code:: ipython3 - - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("image-classification-quantization.ipynb") - -.. code:: ipython3 - - from cmd_helper import clone_repo - - - clone_repo("https://github.com/chenyaofo/pytorch-cifar-models.git") - - - - -.. parsed-literal:: - - PosixPath('pytorch-cifar-models') - - - -.. code:: ipython3 - - from pytorch_cifar_models import cifar10_mobilenetv2_x1_0 - - model = cifar10_mobilenetv2_x1_0(pretrained=True) - -OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate -Representation format using model conversion Python API. -``ov.convert_model`` accept PyTorch model instance and convert it into -``openvino.runtime.Model`` representation of model in OpenVINO. -Optionally, you may specify ``example_input`` which serves as a helper -for model tracing and ``input_shape`` for converting the model with -static shape. The converted model is ready to be loaded on a device for -inference and can be saved on a disk for next usage via the -``save_model`` function. 
More details about model conversion Python API -can be found on this -`page `__. - -.. code:: ipython3 - - import openvino as ov - - model.eval() - - ov_model = ov.convert_model(model, input=[1, 3, 32, 32]) - - ov.save_model(ov_model, MODEL_DIR / "mobilenet_v2.xml") - -Prepare Dataset ---------------- - - - -We will use `CIFAR10 `__ -dataset from -`torchvision `__. -Preprocessing for model obtained from training -`config `__ - -.. code:: ipython3 - - import torch - from torchvision import transforms - from torchvision.datasets import CIFAR10 - - transform = transforms.Compose( - [ - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)), - ] - ) - dataset = CIFAR10(root=DATA_DIR, train=False, transform=transform, download=True) - val_loader = torch.utils.data.DataLoader( - dataset, - batch_size=1, - shuffle=False, - num_workers=0, - pin_memory=True, - ) - - -.. parsed-literal:: - - Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to data/cifar-10-python.tar.gz - - -.. parsed-literal:: - - 100%|██████████| 170498071/170498071 [00:07<00:00, 23351071.97it/s] - - -.. parsed-literal:: - - Extracting data/cifar-10-python.tar.gz to data - - -Perform Quantization --------------------- - - - -`NNCF `__ provides a suite of -advanced algorithms for Neural Networks inference optimization in -OpenVINO with minimal accuracy drop. We will use 8-bit quantization in -post-training mode (without the fine-tuning pipeline) to optimize -MobileNetV2. The optimization process contains the following steps: - -1. Create a Dataset for quantization. -2. Run ``nncf.quantize`` for getting an optimized model. -3. Serialize an OpenVINO IR model, using the ``openvino.save_model`` - function. - -Create Dataset for Validation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -NNCF is compatible with ``torch.utils.data.DataLoader`` interface. For -performing quantization it should be passed into ``nncf.Dataset`` object -with transformation function, which prepares input data to fit into -model during quantization, in our case, to pick input tensor from pair -(input tensor and label) and convert PyTorch tensor to numpy. - -.. code:: ipython3 - - import nncf - - - def transform_fn(data_item): - image_tensor = data_item[0] - return image_tensor.numpy() - - - quantization_dataset = nncf.Dataset(val_loader, transform_fn) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -Run nncf.quantize for Getting an Optimized Model ------------------------------------------------- - - - -``nncf.quantize`` function accepts model and prepared quantization -dataset for performing basic quantization. Optionally, additional -parameters like ``subset_size``, ``preset``, ``ignored_scope`` can be -provided to improve quantization result if applicable. More details -about supported parameters can be found on this -`page `__ - -.. code:: ipython3 - - quant_ov_model = nncf.quantize(ov_model, quantization_dataset) - - -.. parsed-literal:: - - 2025-02-04 02:37:03.085758: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2025-02-04 02:37:03.118033: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. 
- To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2025-02-04 02:37:03.667541: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - - -.. parsed-literal:: - - Output() - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - -Serialize an OpenVINO IR model ------------------------------- - - - -Similar to ``ov.convert_model``, quantized model is ``ov.Model`` object -which ready to be loaded into device and can be serialized on disk using -``ov.save_model``. - -.. code:: ipython3 - - ov.save_model(quant_ov_model, MODEL_DIR / "quantized_mobilenet_v2.xml") - -Compare Accuracy of the Original and Quantized Models ------------------------------------------------------ - - - -.. code:: ipython3 - - from tqdm.notebook import tqdm - import numpy as np - - - def test_accuracy(ov_model, data_loader): - correct = 0 - total = 0 - for batch_imgs, batch_labels in tqdm(data_loader): - result = ov_model(batch_imgs)[0] - top_label = np.argmax(result) - correct += top_label == batch_labels.numpy() - total += 1 - return correct / total - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - core = ov.Core() - compiled_model = core.compile_model(ov_model, device.value) - optimized_compiled_model = core.compile_model(quant_ov_model, device.value) - - orig_accuracy = test_accuracy(compiled_model, val_loader) - optimized_accuracy = test_accuracy(optimized_compiled_model, val_loader) - - - -.. parsed-literal:: - - 0%| | 0/10000 [00:00`__ -- an inference performance measurement tool in OpenVINO. - - **NOTE**: For more accurate performance, it is recommended to run - benchmark_app in a terminal/command prompt after closing other - applications. Run ``benchmark_app -m model.xml -d CPU`` to benchmark - async inference on CPU for one minute. Change CPU to GPU to benchmark - on GPU. Run ``benchmark_app --help`` to see an overview of all - command-line options. - -.. code:: ipython3 - - # Inference FP16 model (OpenVINO IR) - !benchmark_app -m "model/mobilenet_v2.xml" -d $device.value -api async -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 9.79 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,32,32] - [ INFO ] Model outputs: - [ INFO ] x.17 (node: aten::linear/Add) : f32 / [...] 
/ [1,10] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,32,32] - [ INFO ] Model outputs: - [ INFO ] x.17 (node: aten::linear/Add) : f32 / [...] / [1,10] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 169.24 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model2 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 24 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model2 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 3.03 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 88152 iterations - [ INFO ] Duration: 15001.94 ms - [ INFO ] Latency: - [ INFO ] Median: 1.84 ms - [ INFO ] Average: 1.85 ms - [ INFO ] Min: 1.32 ms - [ INFO ] Max: 51.24 ms - [ INFO ] Throughput: 5876.04 FPS - - -.. code:: ipython3 - - # Inference INT8 model (OpenVINO IR) - !benchmark_app -m "model/quantized_mobilenet_v2.xml" -d $device.value -api async -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 15.37 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,32,32] - [ INFO ] Model outputs: - [ INFO ] x.17 (node: aten::linear/Add) : f32 / [...] 
/ [1,10] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,32,32] - [ INFO ] Model outputs: - [ INFO ] x.17 (node: aten::linear/Add) : f32 / [...] / [1,10] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 264.72 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model2 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 24 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model2 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 1.89 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 167124 iterations - [ INFO ] Duration: 15001.02 ms - [ INFO ] Latency: - [ INFO ] Median: 1.00 ms - [ INFO ] Average: 1.03 ms - [ INFO ] Min: 0.69 ms - [ INFO ] Max: 23.83 ms - [ INFO ] Throughput: 11140.84 FPS - - -Compare results on four pictures --------------------------------- - - - -.. code:: ipython3 - - # Define all possible labels from the CIFAR10 dataset - labels_names = [ - "airplane", - "automobile", - "bird", - "cat", - "deer", - "dog", - "frog", - "horse", - "ship", - "truck", - ] - all_pictures = [] - all_labels = [] - - # Get all pictures and their labels. - for i, batch in enumerate(val_loader): - all_pictures.append(batch[0].numpy()) - all_labels.append(batch[1].item()) - -.. code:: ipython3 - - import matplotlib.pyplot as plt - - - def plot_pictures(indexes: list, all_pictures=all_pictures, all_labels=all_labels): - """Plot 4 pictures. - :param indexes: a list of indexes of pictures to be displayed. - :param all_batches: batches with pictures. 
- """ - images, labels = [], [] - num_pics = len(indexes) - assert num_pics == 4, f"No enough indexes for pictures to be displayed, got {num_pics}" - for idx in indexes: - assert idx < 10000, "Cannot get such index, there are only 10000" - pic = np.rollaxis(all_pictures[idx].squeeze(), 0, 3) - images.append(pic) - - labels.append(labels_names[all_labels[idx]]) - - f, axarr = plt.subplots(1, 4) - axarr[0].imshow(images[0]) - axarr[0].set_title(labels[0]) - - axarr[1].imshow(images[1]) - axarr[1].set_title(labels[1]) - - axarr[2].imshow(images[2]) - axarr[2].set_title(labels[2]) - - axarr[3].imshow(images[3]) - axarr[3].set_title(labels[3]) - -.. code:: ipython3 - - def infer_on_pictures(model, indexes: list, all_pictures=all_pictures): - """Inference model on a few pictures. - :param net: model on which do inference - :param indexes: list of indexes - """ - output_key = model.output(0) - predicted_labels = [] - for idx in indexes: - assert idx < 10000, "Cannot get such index, there are only 10000" - result = model(all_pictures[idx])[output_key] - result = labels_names[np.argmax(result[0])] - predicted_labels.append(result) - return predicted_labels - -.. code:: ipython3 - - indexes_to_infer = [7, 12, 15, 20] # To plot, specify 4 indexes. - - plot_pictures(indexes_to_infer) - - results_float = infer_on_pictures(compiled_model, indexes_to_infer) - results_quanized = infer_on_pictures(optimized_compiled_model, indexes_to_infer) - - print(f"Labels for picture from float model : {results_float}.") - print(f"Labels for picture from quantized model : {results_quanized}.") - - -.. parsed-literal:: - - Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). - Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). - Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). - Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). - - -.. parsed-literal:: - - Labels for picture from float model : ['frog', 'dog', 'ship', 'horse']. - Labels for picture from quantized model : ['frog', 'dog', 'ship', 'horse']. - - - -.. 
image:: image-classification-quantization-with-output_files/image-classification-quantization-with-output_31_2.png - diff --git a/docs/notebooks/image-classification-quantization-with-output_files/image-classification-quantization-with-output_31_2.png b/docs/notebooks/image-classification-quantization-with-output_files/image-classification-quantization-with-output_31_2.png deleted file mode 100644 index 62b64e695a3dbf..00000000000000 --- a/docs/notebooks/image-classification-quantization-with-output_files/image-classification-quantization-with-output_31_2.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9990baa1e4f121781deb249b7534c8f31f26ca9b37afd3f269ca50ec0318e7d4 -size 14855 diff --git a/docs/notebooks/image-to-image-genai-with-output.rst b/docs/notebooks/image-to-image-genai-with-output.rst deleted file mode 100644 index c99996dadabd75..00000000000000 --- a/docs/notebooks/image-to-image-genai-with-output.rst +++ /dev/null @@ -1,518 +0,0 @@ -Image-to-image generation using OpenVINO GenAI -============================================== - -Image-to-image is the task of transforming an input image through a -variety of possible manipulations and enhancements, such as -super-resolution, image inpainting, colorization, stylization and more. - -One of the most popular use cases of image-to-image is style transfer. -With style transfer models: \* a regular photo can be transformed into a -variety of artistic styles or genres, such as a watercolor painting, a -comic book illustration and more. \* new images can be generated using a -text prompt, in the style of a reference input image. - -Latent diffusion models can be used for performing image-to-image -generation. Diffusion-based Image-to-image is similar to -`text-to-image `__, -but in addition to a prompt, you can also pass an initial image as a -starting point for the diffusion process. The initial image is encoded -to latent space and noise is added to it. Then the latent diffusion -model takes a prompt and the noisy latent image, predicts the added -noise, and removes the predicted noise from the initial latent image to -get the new latent image. Lastly, a decoder decodes the new latent image -back into an image. - -.. figure:: https://user-images.githubusercontent.com/29454499/260981188-c112dd0a-5752-4515-adca-8b09bea5d14a.png - :alt: pipe.png - - pipe.png - -In this tutorial, we consider how to use OpenVINO GenAI for performing -image-to-image generation. - -About OpenVINO GenAI --------------------- - -`OpenVINO™ GenAI `__ -is a library of the most popular Generative AI model pipelines, -optimized execution methods, and samples that run on top of highly -performant OpenVINO Runtime. - -This library is friendly to PC and laptop execution, and optimized for -resource consumption. It requires no external dependencies to run -generative models as it already includes all the core functionality -(e.g. tokenization via openvino-tokenizers). - -OpenVINO GenAI supports popular diffusion models like Stable Diffusion -or SDXL for performing image generation. You can find supported models -list in `OpenVINO GenAI -documentation `__. -Previously, we considered how to run -`text-to-image `__ -generation with OpenVINO GenAI and `apply multiple LoRA -adapters `__, -now is image-to-image turn. 
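-
-The full pipeline is assembled step by step in the sections below. As a
-short preview, a minimal usage sketch of the GenAI image-to-image
-pipeline looks like this (it assumes the OpenVINO model has already been
-exported or downloaded into the ``stable-diffusion-v1-5-int8-ov``
-directory and that the initial image has been saved as
-``astronauts.png``, both of which are done later in this tutorial):
-
-.. code:: ipython3
-
-    # Minimal sketch: load a converted model and run image-to-image generation
-    import numpy as np
-    import openvino as ov
-    import openvino_genai as ov_genai
-    from PIL import Image
-
-    pipe = ov_genai.Image2ImagePipeline("stable-diffusion-v1-5-int8-ov", "CPU")
-
-    init_image = Image.open("astronauts.png").convert("RGB")
-    image_data = np.array(init_image, dtype=np.uint8)[None]  # 1 x H x W x 3 uint8
-    image_tensor = ov.Tensor(image_data)
-
-    result = pipe.generate(
-        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
-        image_tensor,
-        strength=0.6,
-        generator=ov_genai.TorchGenerator(42),
-    )
-    Image.fromarray(result.data[0]).save("result.png")
-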
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Prepare model <#prepare-model>`__ - - - `Export OpenVINO IR format model using the Hugging Face Optimum - library accelerated by OpenVINO - integration. <#export-openvino-ir-format-model-using-the-hugging-face-optimum-library-accelerated-by-openvino-integration->`__ - - `Use optimized models provided on HuggingFace - Hub <#use-optimized-models-provided-on-huggingface-hub>`__ - -- `Create inference pipeline <#create-inference-pipeline>`__ -- `Prepare inputs <#prepare-inputs>`__ -- `Run inference pipeline <#run-inference-pipeline>`__ -- `Configure pipeline parameters <#configure-pipeline-parameters>`__ - - - `Strength <#strength>`__ - - `Guidance scale <#guidance-scale>`__ - - `Negative prompt <#negative-prompt>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - import platform - import requests - from pathlib import Path - - - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q -U --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly "openvino>=2024.5" "openvino-tokenizers>=2024.5" "openvino-genai>=2024.5" - %pip install -q Pillow "diffusers>=0.30.3" "gradio>=4.19" "typing_extensions>=4.9" "tqdm" huggingface-hub "nncf>=2.14.0" --extra-index-url https://download.pytorch.org/whl/cpu - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("image-to-image-genai.ipynb") - -Prepare model -------------- - - - -For starting working with OpenVINO GenAI pipelines, we should prepare -models for inference. OpenVINO GenAI image generation pipelines accepts -diffusers-compatible models converted to OpenVINO Intermediate -Representation format using Optimum Intel Interface. - -Export OpenVINO IR format model using the `Hugging Face Optimum `__ library accelerated by OpenVINO integration. -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -`Optimum Intel `__ -is the interface between the -`Transformers `__ and -`Diffusers `__ libraries -and OpenVINO to accelerate end-to-end pipelines on Intel architectures. -It provides ease-to-use cli interface for exporting models to `OpenVINO -Intermediate Representation -(IR) `__ -format. - -The command bellow demonstrates basic command for model export with -``optimum-cli`` - -.. 
code:: bash - - optimum-cli export openvino --model --task - -where ``--model`` argument is model id from HuggingFace Hub or local -directory with model (saved using ``.save_pretrained`` method), -``--task`` is one of `supported -task `__ -that exported model should solve. For image generation models, -``text-to-image`` or ``image-to-image`` should be used (as pipeline -components are the same, you can use converted models for both -text-to-image and image-to-image generation. There is no need to convert -models twice). If model initialization requires to use remote code, -``--trust-remote-code`` flag additionally should be passed. You can also -apply fp16, 8-bit or 4-bit weight compression on the Linear, -Convolutional and Embedding layers when exporting your model with the -CLI by setting ``--weight-format`` to respectively fp16, int8 or int4. -This type of optimization allows to reduce the memory footprint and -inference latency. - -We will use ``optimum_cli`` from our helper ``cmd_helper.py`` that is a -wrapper over cli-command. - -Use optimized models provided on HuggingFace Hub -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -For quick start, OpenVINO provides -`collection `__ -of optimized models that are ready to use with OpenVINO GenAI. You can -download them using following command: - -.. code:: bash - - huggingface-cli download --local-dir - -.. code:: ipython3 - - import ipywidgets as widgets - - use_preconverted = widgets.Checkbox(value=True, description="Use preconverted", disabled=False) - - use_preconverted - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Use preconverted') - - - -.. code:: ipython3 - - from pathlib import Path - - from cmd_helper import optimum_cli - - ov_model_id = "OpenVINO/stable-diffusion-v1-5-int8-ov" - hf_model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5" - model_dir = Path("stable-diffusion-v1-5-int8-ov") - - - if not model_dir.exists(): - if use_preconverted.value: - !huggingface-cli download {ov_model_id} --local-dir {model_dir} - else: - optimum_cli(hf_model_id, model_dir, additional_args={"weight-format": "int8"}) - -Create inference pipeline -------------------------- - - - -For creation Image-to-Image generation pipeline -``openvino_genai.Image2ImagePipeline`` should be used. The pipeline -accepts directory with converted model and inference device. Let’s -select execution device: - -.. code:: ipython3 - - from notebook_utils import device_widget - - - device = device_widget("CPU", exclude=["NPU", "AUTO"]) - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -.. code:: ipython3 - - import openvino_genai as ov_genai - - pipe = ov_genai.Image2ImagePipeline(model_dir, device.value) - -Prepare inputs --------------- - - - -The main inputs for image-to-image generation are input text prompt and -image. Input prompt is represented by text string that contains user’s -instructions for pipeline guidance. Input image should be provided as -``ov.Tensor``. The code bellow demonstrates how to convert image after -reading Pillow library to OpenVINO tensor. You also can pass some -additional advanced options for better controlling generation process. -We consider the most useful from them in next sections. - -.. 
code:: ipython3 - - import openvino as ov - import numpy as np - from PIL import Image - - - def image_to_tensor(image: Image) -> ov.Tensor: - pic = image.convert("RGB") - image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8) - return ov.Tensor(image_data) - -.. code:: ipython3 - - from PIL import Image - from diffusers.utils import load_image - - init_image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png" - init_image_path = Path("astronauts.png") - - if not init_image_path.exists(): - image = load_image(init_image_url) - image.save(init_image_path) - - init_image = Image.open(init_image_path) - init_image_tensor = image_to_tensor(init_image) - - init_image - - - - -.. image:: image-to-image-genai-with-output_files/image-to-image-genai-with-output_11_0.png - - - -Run inference pipeline ----------------------- - - - -For starting generation process, ``generate`` method should be used. -Together with input prompt and image, we also provided ``generator``, -pseudo-random numbers generator that responsible for results -reproducibility. We will use ``ov_genai.TorchGenerator`` helper, the -behavior of this generator is aligned with PyTorch. It means you can get -result as much close to original diffusion pipeline as possible. -``callback`` argument allows early stopping generation if we decided -that we are satisfied generation results when ``num_inference_steps`` -has not been not reached yet, but it also can be useful for some other -cases as well. In this tutorial, we will use it for adding interactive -progress bar. - -.. code:: ipython3 - - from tqdm.notebook import tqdm - import sys - - prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k" - random_generator = ov_genai.TorchGenerator(42) - - pbar = tqdm(total=31) - - - def callback(step, num_steps, latent): - pbar.update(1) - sys.stdout.flush() - return False - - - image_tensor = pipe.generate(prompt, init_image_tensor, callback=callback, generator=random_generator, strength=0.6) - - pbar.close() - - out_image = Image.fromarray(image_tensor.data[0]) - out_image - - - -.. parsed-literal:: - - 0%| | 0/31 [00:00`__ -is a library of the most popular Generative AI model pipelines, -optimized execution methods, and samples that run on top of highly -performant OpenVINO Runtime. - -This library is friendly to PC and laptop execution, and optimized for -resource consumption. It requires no external dependencies to run -generative models as it already includes all the core functionality -(e.g. tokenization via openvino-tokenizers). - -OpenVINO GenAI supports popular diffusion models like Stable Diffusion -or SDXL for performing image generation. You can find supported models -list in `OpenVINO GenAI -documentation `__. -Previously, we considered how to run -`text-to-image `__ and -`image-to-image `__ -generation with OpenVINO GenAI and `apply multiple LoRA -adapters `__, -now is inpainting turn. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Prepare model <#prepare-model>`__ - - - `Export OpenVINO IR format model using the Hugging Face Optimum - library accelerated by OpenVINO - integration. 
<#export-openvino-ir-format-model-using-the-hugging-face-optimum-library-accelerated-by-openvino-integration->`__ - - `Use optimized models provided on HuggingFace - Hub <#use-optimized-models-provided-on-huggingface-hub>`__ - -- `Create inference pipeline <#create-inference-pipeline>`__ - - - `Prepare input <#prepare-input>`__ - - `Run inference pipeline <#run-inference-pipeline>`__ - -- `Mask blur <#mask-blur>`__ -- `Use non-inpaint specific - models <#use-non-inpaint-specific-models>`__ -- `Configure pipeline parameters <#configure-pipeline-parameters>`__ - - - `Strength <#strength>`__ - - `Guidance Scale <#guidance-scale>`__ - - `Negative prompt <#negative-prompt>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - import platform - import requests - from pathlib import Path - - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q -U --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly "openvino>=2024.5" "openvino-tokenizers>=2024.5" "openvino-genai>=2024.5" - %pip install -q Pillow "diffusers>=0.30.3" "gradio>=4.19" "typing_extensions>=4.9" "tqdm" huggingface-hub "nncf>=2.14.0" --extra-index-url https://download.pytorch.org/whl/cpu - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("inpainting-genai.ipynb") - -Prepare model -------------- - - - -For starting working with OpenVINO GenAI pipelines, we should prepare -models for inference. OpenVINO GenAI image generation pipelines accepts -diffusers-compatible models converted to OpenVINO Intermediate -Representation format using Optimum Intel Interface. - -Export OpenVINO IR format model using the `Hugging Face Optimum `__ library accelerated by OpenVINO integration. -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -`Optimum Intel `__ -is the interface between the -`Transformers `__ and -`Diffusers `__ libraries -and OpenVINO to accelerate end-to-end pipelines on Intel architectures. -It provides ease-to-use cli interface for exporting models to `OpenVINO -Intermediate Representation -(IR) `__ -format. - -The command bellow demonstrates basic command for model export with -``optimum-cli`` - -.. 
code:: bash - - optimum-cli export openvino --model --task - -where ``--model`` argument is model id from HuggingFace Hub or local -directory with model (saved using ``.save_pretrained`` method), -``--task`` is one of `supported -task `__ -that exported model should solve. For inpainting models, ``inpainting`` -should be used. If model initialization requires to use remote code, -``--trust-remote-code`` flag additionally should be passed. You can also -apply fp16, 8-bit or 4-bit weight compression on the Linear, -Convolutional and Embedding layers when exporting your model with the -CLI by setting ``--weight-format`` to respectively fp16, int8 or int4. -This type of optimization allows to reduce the memory footprint and -inference latency. - -We will use ``optimum_cli`` from our helper ``cmd_helper.py`` that is a -wrapper over cli-command. - -Use optimized models provided on HuggingFace Hub -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -For quick start, OpenVINO provides -`collection `__ -of optimized models that are ready to use with OpenVINO GenAI. You can -download them using following command: - -.. code:: bash - - huggingface-cli download --local-dir - -.. code:: ipython3 - - from cmd_helper import optimum_cli - - inpaint_model_id = "stable-diffusion-v1-5/stable-diffusion-inpainting" - - inpaint_model_dir = Path(inpaint_model_id.split("/")[-1] + "-int8-ov") - - if not inpaint_model_dir.exists(): - optimum_cli(inpaint_model_id, inpaint_model_dir, additional_args={"weight-format": "int8"}) - -Create inference pipeline -------------------------- - - - -For creation Inpainting pipeline ``openvino_genai.InpaintingPipeline`` -should be used. The pipeline accepts directory with converted model and -inference device. Let’s select execution device: - -.. code:: ipython3 - - from notebook_utils import device_widget - - - device = device_widget("CPU", exclude=["NPU", "AUTO"]) - device - - -.. parsed-literal:: - - :247: DeprecationWarning: The `openvino.runtime` module is deprecated and will be removed in the 2026.0 release. Please replace `openvino.runtime` with `openvino`. - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU',), value='CPU') - - - -.. code:: ipython3 - - import openvino_genai as ov_genai - - pipe = ov_genai.InpaintingPipeline(inpaint_model_dir, device.value) - - - -Prepare input -~~~~~~~~~~~~~ - - - -The main inputs for inpainting are input text prompt, image and mask for -editing region for image. Input prompt is represented by text string -that contains user’s instructions for pipeline guidance. Input image and -mask should be provided as ``ov.Tensor``. The code bellow demonstrates -how to convert image after reading Pillow library to OpenVINO tensor. -You also can pass some additional advanced options for better -controlling generation process. We consider the most useful from them in -next sections. - -.. code:: ipython3 - - from diffusers.utils import load_image, make_image_grid - - image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png" - mask_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png" - - init_image_path = Path("inpaint.png") - mask_path = Path("inpaint_mask.png") - - if not init_image_path.exists(): - load_image(image_url).save(init_image_path) - - if not mask_path.exists(): - load_image(mask_url).save(mask_path) - -.. 
code:: ipython3 - - import openvino as ov - import numpy as np - from PIL import Image - - - def image_to_tensor(image: Image) -> ov.Tensor: - pic = image.convert("RGB") - image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8) - return ov.Tensor(image_data) - - - init_image = Image.open(init_image_path) - init_image_tensor = image_to_tensor(init_image) - - mask_image = Image.open(mask_path) - mask_image_tensor = image_to_tensor(mask_image) - - make_image_grid([init_image, mask_image], rows=1, cols=2) - - - - -.. image:: inpainting-genai-with-output_files/inpainting-genai-with-output_10_0.png - - - -Run inference pipeline -~~~~~~~~~~~~~~~~~~~~~~ - - - -For starting generation process, ``generate`` method should be used. -Together with input prompt, mask and image, we also provided -``generator``, pseudo-random numbers generator that responsible for -results reproducibility. We will use ``ov_genai.TorchGenerator`` helper, -the behavior of this generator is aligned with PyTorch. It means you can -get result as much close to original diffusion pipeline as possible. -``callback`` argument allows early stopping generation if we decided -that we are satisfied generation results when ``num_inference_steps`` -has not been not reached yet, but it also can be useful for some other -cases as well. In this tutorial, we will use it for adding interactive -progress bar. - -.. code:: ipython3 - - from tqdm.notebook import tqdm - import sys - - generator = ov_genai.TorchGenerator(92) - prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" - - pbar = tqdm() - - - def callback(step, num_steps, latent): - if pbar.total is None: - pbar.reset(num_steps) - pbar.update(1) - sys.stdout.flush() - return False - - - image_tensor = pipe.generate(prompt, init_image_tensor, mask_image_tensor, generator=generator, callback=callback, num_inference_steps=20) - - pbar.close() - out_image = Image.fromarray(image_tensor.data[0]) - - make_image_grid([init_image, mask_image, out_image], rows=1, cols=3) - - - -.. parsed-literal:: - - 0it [00:00, ?it/s] - - - - -.. image:: inpainting-genai-with-output_files/inpainting-genai-with-output_12_1.png - - - -Mask blur ---------- - - - -Mask blurring is useful technique for improving integration of modified -area with original image. The code bellow demonstrates how to obtain -mask with `Gaussian -Blur `__ and apply it for -generation. - -.. code:: ipython3 - - from PIL import ImageFilter - - - def blur(mask: Image, blur_factor: int = 4) -> Image: - return mask.filter(ImageFilter.GaussianBlur(blur_factor)) - - - blurred_mask_image = blur(mask_image, 4) - - blurred_mask_tensor = image_to_tensor(blurred_mask_image) - - make_image_grid([mask_image, blurred_mask_image], rows=1, cols=2) - - - - -.. image:: inpainting-genai-with-output_files/inpainting-genai-with-output_14_0.png - - - -.. code:: ipython3 - - generator = ov_genai.TorchGenerator(92) - - pbar = tqdm() - - image_tensor = pipe.generate(prompt, init_image_tensor, blurred_mask_tensor, generator=generator, callback=callback, num_inference_steps=20) - - pbar.close() - out_image = Image.fromarray(image_tensor.data[0]) - - make_image_grid([init_image, blurred_mask_image, out_image], rows=1, cols=3) - - - -.. parsed-literal:: - - 0it [00:00, ?it/s] - - - - -.. 
image:: inpainting-genai-with-output_files/inpainting-genai-with-output_15_1.png - - - -Use non-inpaint specific models -------------------------------- - - - -In this tutorial, we use -`stable-diffusion-v1-5/stable-diffusion-inpainting `__ -model, fine-tuned for inpainting Stable Diffusion 1.5 version, but -``InpaingpingPipeline`` also supports loading regular text-to-image -models. Let’s compare the results of the two checkpoints. - -.. code:: ipython3 - - ov_model_id = "OpenVINO/stable-diffusion-v1-5-int8-ov" - hf_model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5" - model_dir = Path("stable-diffusion-v1-5-int8-ov") - - if not model_dir.exists(): - !huggingface-cli download {ov_model_id} --local-dir {model_dir} - -.. code:: ipython3 - - pipe = ov_genai.InpaintingPipeline(model_dir, device.value) - generator = ov_genai.TorchGenerator(92) - prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" - - pbar = tqdm() - - image_tensor = pipe.generate(prompt, init_image_tensor, mask_image_tensor, generator=generator, callback=callback, num_inference_steps=20) - - pbar.close() - out_image = Image.fromarray(image_tensor.data[0]) - - make_image_grid([init_image, mask_image, out_image], rows=1, cols=3) - - - - - - - - - -.. parsed-literal:: - - 0it [00:00, ?it/s] - - - - -.. image:: inpainting-genai-with-output_files/inpainting-genai-with-output_18_2.png - - - -However, for more basic tasks like erasing an object from an image (like -the rocks in the road for example), a regular checkpoint yields pretty -good results. There isn’t as noticeable of difference between the -regular and inpaint checkpoint. - -.. code:: ipython3 - - road_mask_path = Path("road_mask.png") - - if not road_mask_path.exists(): - road_mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/road-mask.png") - road_mask_image.save(road_mask_path) - - road_mask_tensor = image_to_tensor(Image.open(road_mask_path)) - - generator = ov_genai.TorchGenerator(42) - -.. code:: ipython3 - - pbar = tqdm() - - image_tensor = pipe.generate("road", init_image_tensor, road_mask_tensor, generator=generator, callback=callback, num_inference_steps=20) - - pbar.close() - out_image = Image.fromarray(image_tensor.data[0]) - - make_image_grid([init_image, out_image], rows=1, cols=2) - - - -.. parsed-literal:: - - 0it [00:00, ?it/s] - - - - -.. image:: inpainting-genai-with-output_files/inpainting-genai-with-output_21_1.png - - - -Configure pipeline parameters ------------------------------ - - - -Image features - like quality and “creativity” - are dependent on -pipeline parameters. Knowing what these parameters do is important for -getting the results you want. Let’s take a look at the most important -parameters and see how changing them affects the output. - -Strength -~~~~~~~~ - - - -``strength`` is a measure of how much noise is added to the base image, -which influences how similar the output is to the base image. - -- 📈 a high strength value means more noise is added to an image and - the denoising process takes longer, but you’ll get higher quality - images that are more different from the base image -- 📉 a low strength value means less noise is added to an image and the - denoising process is faster, but the image quality may not be as - great and the generated image resembles the base image more - -.. 
image:: https://github.com/user-attachments/assets/745c849d-b7a0-4881-87d5-a4f3d450479d - -Guidance Scale -~~~~~~~~~~~~~~ - - - -``guidance_scale`` affects how aligned the text prompt and generated -image are. - -- 📈 a high guidance_scale value means the prompt and generated image - are closely aligned, so the output is a stricter interpretation of - the prompt -- 📉 a low guidance_scale value means the prompt and generated image - are more loosely aligned, so the output may be more varied from the - prompt - -.. image:: https://github.com/user-attachments/assets/1249e875-5e51-4309-8ba4-1d9b609e0976 - -You can use ``strength`` and ``guidance_scale`` together for more -control over how expressive the model is. For example, a combination -high ``strength`` and ``guidance_scale`` values gives the model the most -creative freedom. - -Negative prompt -~~~~~~~~~~~~~~~ - - - -A negative prompt assumes the opposite role of a prompt; it guides the -model away from generating certain things in an image. This is useful -for quickly improving image quality and preventing the model from -generating things you don’t want. - -Interactive demo ----------------- - - - -Now, you can try inpainting capabilities on own images. Upload image or -use one from provided examples, provide input text prompt and use brush -button for starting drawing mask on top of input image. For start -generation, please click ``Inpaint`` button. You also can adjust -generation controlling parameters like ``guidance_scale``, ``streight``, -``negative_prompt``, ``num_inference_steps``, ``seed`` and mask -processing using ``Advanced Parameters`` panel. - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get() - with open("gradio_helper.py") as f: - f.write(r.text) - - from gradio_helper import make_demo - - pipe = ov_genai.InpaintingPipeline(inpaint_model_dir, device.value) - - - demo = make_demo(pipe) - - if __name__ == "__main__": - demo.launch(debug=True) diff --git a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_10_0.jpg b/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_10_0.jpg deleted file mode 100644 index f233a056b0275f..00000000000000 --- a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_10_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:53ca6b56e6c83dce33dc2a51af9b556559cc093ca3aab6ee5020b303edbe1b38 -size 57913 diff --git a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_10_0.png b/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_10_0.png deleted file mode 100644 index 18098cf654834b..00000000000000 --- a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_10_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:af27412b57291058040917fd2a293ec533cbc05696c3c29c43dcbb4853b4b564 -size 357956 diff --git a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_12_1.jpg b/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_12_1.jpg deleted file mode 100644 index a5544cfeba0ac2..00000000000000 --- a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_12_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:37ed7438c670ae90b849164014dbaaad830d3a565cdbd103dafe9a87bf40ba43 -size 110061 diff --git 
a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_12_1.png b/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_12_1.png deleted file mode 100644 index ee3f09e38d1e3f..00000000000000 --- a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_12_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:228353861dbae34c4330dcf666554a52aa961e9ccd5f10a2a27d79fd43e358a6 -size 874407 diff --git a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_14_0.jpg b/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_14_0.jpg deleted file mode 100644 index f47e6887fa09a6..00000000000000 --- a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_14_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:91bc55aa48fc92418593e953cd0774170861d251a45bf402cd2f93a1a21a7c4a -size 14471 diff --git a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_14_0.png b/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_14_0.png deleted file mode 100644 index 8ebfe3f00d2e21..00000000000000 --- a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_14_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5a5f13e457347577da355ba48b5450d7d32c812501b93c0df13af0d12e4d991f -size 26862 diff --git a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_15_1.jpg b/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_15_1.jpg deleted file mode 100644 index f3f46021ca1f31..00000000000000 --- a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_15_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:53852f16a531a8c0191043c5af0c4a5f1d61550c86272e9d29e2b58562f46e95 -size 107971 diff --git a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_15_1.png b/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_15_1.png deleted file mode 100644 index 3e972f2c6d9d81..00000000000000 --- a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_15_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:17a1f4b1b64c0da7a0da62c4717dceb563114c2e6caf8510e5b57f97710f9be7 -size 889069 diff --git a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_18_2.jpg b/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_18_2.jpg deleted file mode 100644 index 441f605f45a94a..00000000000000 --- a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_18_2.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cecc2d3f504877791f698e60f677710cc2ca48e27a6cf967087e9cc2945a59ef -size 108533 diff --git a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_18_2.png b/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_18_2.png deleted file mode 100644 index 4b550db716622f..00000000000000 --- a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_18_2.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4ddfe98be9329e515c25a501ad59597b8d897e36a89e80ac0fe723fbce53dce3 
-size 867425 diff --git a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_21_1.jpg b/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_21_1.jpg deleted file mode 100644 index f53d94d87662b7..00000000000000 --- a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_21_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:97527fde0f5b3d6163ccbbe4b1965917d5d7d8c2e6bb6b52ec8e5dc94621bec6 -size 97584 diff --git a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_21_1.png b/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_21_1.png deleted file mode 100644 index e696105973cc61..00000000000000 --- a/docs/notebooks/inpainting-genai-with-output_files/inpainting-genai-with-output_21_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:56c8efc65649b7d0defb38a91a3005b7ba02362f07d62125726b4294625f8030 -size 823767 diff --git a/docs/notebooks/instant-id-with-output.rst b/docs/notebooks/instant-id-with-output.rst deleted file mode 100644 index 1f9d23f5b0ed55..00000000000000 --- a/docs/notebooks/instant-id-with-output.rst +++ /dev/null @@ -1,4391 +0,0 @@ -InstantID: Zero-shot Identity-Preserving Generation using OpenVINO -================================================================== - -Nowadays has been significant progress in personalized image synthesis -with methods such as Textual Inversion, DreamBooth, and LoRA. However, -their real-world applicability is hindered by high storage demands, -lengthy fine-tuning processes, and the need for multiple reference -images. Conversely, existing ID embedding-based methods, while requiring -only a single forward inference, face challenges: they either -necessitate extensive fine-tuning across numerous model parameters, lack -compatibility with community pre-trained models, or fail to maintain -high face fidelity. - -`InstantID `__ is a tuning-free method to -achieve ID-Preserving generation with only single image, supporting -various downstream tasks. |applications.png| - -Given only one reference ID image, InstantID aims to generate customized -images with various poses or styles from a single reference ID image -while ensuring high fidelity. Following figure provides an overview of -the method. It incorporates three crucial components: - -1. An ID embedding that captures robust semantic face information; -2. A lightweight adapted module with decoupled cross-attention, - facilitating the use of an image as a visual prompt; -3. An IdentityNet that encodes the detailed features from the reference - facial image with additional spatial control. - -.. figure:: https://instantid.github.io/static/documents/pipeline.png - :alt: instantid-components.png - - instantid-components.png - -The difference InstantID from previous works in the following aspects: -1. do not involve UNet training, so it can preserve the generation -ability of the original text-to-image model and be compatible with -existing pre-trained models and ControlNets in the community; 2. doesn’t -require test-time tuning, so for a specific character, there is no need -to collect multiple images for fine-tuning, only a single image needs to -be inferred once; 3. achieve better face fidelity, and retain the -editability of text. 
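-
-The ID embedding mentioned above (component 1) is produced by a
-conventional face-analysis stack: detect the face, align it by five
-landmarks, and run the aligned crop through an ArcFace-style recognition
-model. The snippet below is only a rough sketch of that flow; it relies
-on the ``RetinaFace``, ``norm_crop`` and ``prepare_input`` helpers that
-are defined later in this notebook, and the input file name is just a
-placeholder:
-
-.. code:: ipython3
-
-    # Rough sketch of ID-embedding extraction (helpers are defined below)
-    import cv2
-    import openvino as ov
-
-    core = ov.Core()
-    detector = RetinaFace(core.compile_model("models/antelopev2/scrfd_10g_bnkps.onnx", "CPU"))
-    embedding_model = core.compile_model("models/antelopev2/glintr100.onnx", "CPU")
-
-    image = cv2.imread("face.jpg")  # placeholder input image
-    bboxes, keypoints = detector.detect(image, input_size=(640, 640))
-    aligned_face = norm_crop(image, keypoints[0], image_size=112)  # align by 5 landmarks
-    blob = prepare_input(aligned_face, std=127.5, mean=127.5)      # NCHW, BGR -> RGB
-    id_embedding = embedding_model(blob)[0]                        # identity vector
-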
- -You can find more details about the approach with `project web -page `__, -`paper `__ and `original -repository `__ - -In this tutorial, we consider how to use InstantID with OpenVINO. An -additional part demonstrates how to run optimization with -`NNCF `__ to speed up -pipeline. - -**Table of contents:** - -- `Prerequisites <#prerequisites>`__ -- `Convert and prepare Face IdentityNet <#convert-and-prepare-face-identitynet>`__ - - - `Select Inference Device for Face Recognition <#select-inference-device-for-face-recognition>`__ - - `Perform Face Identity extraction <#perform-face-identity-extraction>`__ - -- `Prepare InstantID pipeline <#prepare-instantid-pipeline>`__ -- `Convert InstantID pipeline components to OpenVINO Intermediate Representation format <#convert-instantid-pipeline-components-to-openvino-intermediate-representation-format>`__ - - - `ControlNet <#controlnet>`__ - - `Unet <#unet>`__ - - `VAE Decoder <#vae-decoder>`__ - - `Text Encoders <#text-encoders>`__ - - `Image Projection Model <#image-projection-model>`__ - -- `Prepare OpenVINO InstantID Pipeline <#prepare-openvino-instantid-pipeline>`__ -- `Run OpenVINO pipeline inference <#run-openvino-pipeline-inference>`__ - - - `Select inference device for InstantID <#select-inference-device-for-instantid>`__ - - `Create pipeline <#create-pipeline>`__ - - `Run inference <#run-inference>`__ - -- `Quantization <#quantization>`__ - - - `Prepare calibration datasets <#prepare-calibration-datasets>`__ - - `Run quantization <#run-quantization>`__ - - - `Run ControlNet Quantization <#run-controlnet-quantization>`__ - - `Run UNet Hybrid Quantization <#run-unet-hybrid-quantization>`__ - - `Run Weights Compression <#run-weights-compression>`__ - - - `Compare model file sizes <#compare-model-file-sizes>`__ - - `Compare inference time of the FP16 and INT8 pipelines <#compare-inference-time-of-the-fp16-and-int8-pipelines>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. |applications.png| image:: https://github.com/InstantID/InstantID/blob/main/assets/applications.png?raw=true - -Prerequisites -------------- - - - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("instant-id.ipynb") - -.. code:: ipython3 - - from cmd_helper import clone_repo - - - clone_repo("https://github.com/InstantID/InstantID.git") - -.. 
code:: ipython3 - - %pip install -q "openvino>=2023.3.0" opencv-python transformers "diffusers>=0.24.0" "matplotlib>=3.4" accelerate gdown "scikit-image>=0.19.2" "gradio>=4.19" "nncf>=2.9.0" "datasets>=2.14.6" "peft>=0.6.2" - -Convert and prepare Face IdentityNet ------------------------------------- - - - -For getting face embeddings and pose key points, InstantID uses -`InsightFace `__ face -analytic library. Its models are distributed in ONNX format and can be -run with OpenVINO. For preparing the face image, we need to detect the -bounding boxes and keypoints for the face using the RetinaFace model, -crop the detected face, align the face location using landmarks, and -provide each face into the Arcface face embedding model for getting the -person’s identity embeddings. - -The code below downloads the InsightFace Antelopev2 model kit and -provides a simple interface compatible with InsightFace for getting face -recognition results. - -.. code:: ipython3 - - from pathlib import Path - - - MODELS_DIR = Path("models") - face_detector_path = MODELS_DIR / "antelopev2" / "scrfd_10g_bnkps.onnx" - face_embeddings_path = MODELS_DIR / "antelopev2" / "glintr100.onnx" - -.. code:: ipython3 - - from zipfile import ZipFile - import gdown - - archive_file = Path("antelopev2.zip") - - if not face_detector_path.exists() or face_embeddings_path.exists(): - if not archive_file.exists(): - gdown.download( - "https://drive.google.com/uc?id=18wEUfMNohBJ4K3Ly5wpTejPfDzp-8fI8", - str(archive_file), - ) - with ZipFile(archive_file, "r") as zip_face_models: - zip_face_models.extractall(MODELS_DIR) - -.. code:: ipython3 - - import cv2 - import numpy as np - from skimage import transform as trans - - - def softmax(z): - assert len(z.shape) == 2 - s = np.max(z, axis=1) - s = s[:, np.newaxis] # necessary step to do broadcasting - e_x = np.exp(z - s) - div = np.sum(e_x, axis=1) - div = div[:, np.newaxis] # dito - return e_x / div - - - def distance2bbox(points, distance, max_shape=None): - """Decode distance prediction to bounding box. - - Args: - points (Tensor): Shape (n, 2), [x, y]. - distance (Tensor): Distance from the given point to 4 - boundaries (left, top, right, bottom). - max_shape (tuple): Shape of the image. - - Returns: - Tensor: Decoded bboxes. - """ - x1 = points[:, 0] - distance[:, 0] - y1 = points[:, 1] - distance[:, 1] - x2 = points[:, 0] + distance[:, 2] - y2 = points[:, 1] + distance[:, 3] - if max_shape is not None: - x1 = x1.clamp(min=0, max=max_shape[1]) - y1 = y1.clamp(min=0, max=max_shape[0]) - x2 = x2.clamp(min=0, max=max_shape[1]) - y2 = y2.clamp(min=0, max=max_shape[0]) - return np.stack([x1, y1, x2, y2], axis=-1) - - - def distance2kps(points, distance, max_shape=None): - """Decode distance prediction to bounding box. - - Args: - points (Tensor): Shape (n, 2), [x, y]. - distance (Tensor): Distance from the given point to 4 - boundaries (left, top, right, bottom). - max_shape (tuple): Shape of the image. - - Returns: - Tensor: Decoded bboxes. 
- """ - preds = [] - for i in range(0, distance.shape[1], 2): - px = points[:, i % 2] + distance[:, i] - py = points[:, i % 2 + 1] + distance[:, i + 1] - if max_shape is not None: - px = px.clamp(min=0, max=max_shape[1]) - py = py.clamp(min=0, max=max_shape[0]) - preds.append(px) - preds.append(py) - return np.stack(preds, axis=-1) - - - def prepare_input(image, std, mean, reverse_channels=True): - normalized_image = (image.astype(np.float32) - mean) / std - if reverse_channels: - normalized_image = normalized_image[:, :, ::-1] - input_tensor = np.expand_dims(np.transpose(normalized_image, (2, 0, 1)), 0) - return input_tensor - - - class RetinaFace: - def __init__(self, ov_model): - self.taskname = "detection" - self.ov_model = ov_model - self.center_cache = {} - self.nms_thresh = 0.4 - self.det_thresh = 0.5 - self._init_vars() - - def _init_vars(self): - self.input_size = (640, 640) - outputs = self.ov_model.outputs - self.input_mean = 127.5 - self.input_std = 128.0 - self.use_kps = False - self._anchor_ratio = 1.0 - self._num_anchors = 1 - if len(outputs) == 6: - self.fmc = 3 - self._feat_stride_fpn = [8, 16, 32] - self._num_anchors = 2 - elif len(outputs) == 9: - self.fmc = 3 - self._feat_stride_fpn = [8, 16, 32] - self._num_anchors = 2 - self.use_kps = True - elif len(outputs) == 10: - self.fmc = 5 - self._feat_stride_fpn = [8, 16, 32, 64, 128] - self._num_anchors = 1 - elif len(outputs) == 15: - self.fmc = 5 - self._feat_stride_fpn = [8, 16, 32, 64, 128] - self._num_anchors = 1 - self.use_kps = True - - def prepare(self, **kwargs): - nms_thresh = kwargs.get("nms_thresh", None) - if nms_thresh is not None: - self.nms_thresh = nms_thresh - det_thresh = kwargs.get("det_thresh", None) - if det_thresh is not None: - self.det_thresh = det_thresh - input_size = kwargs.get("input_size", None) - if input_size is not None: - if self.input_size is not None: - print("warning: det_size is already set in detection model, ignore") - else: - self.input_size = input_size - - def forward(self, img, threshold): - scores_list = [] - bboxes_list = [] - kpss_list = [] - blob = prepare_input(img, self.input_mean, self.input_std, True) - net_outs = self.ov_model(blob) - - input_height = blob.shape[2] - input_width = blob.shape[3] - fmc = self.fmc - for idx, stride in enumerate(self._feat_stride_fpn): - scores = net_outs[idx] - bbox_preds = net_outs[idx + fmc] - bbox_preds = bbox_preds * stride - if self.use_kps: - kps_preds = net_outs[idx + fmc * 2] * stride - height = input_height // stride - width = input_width // stride - key = (height, width, stride) - if key in self.center_cache: - anchor_centers = self.center_cache[key] - else: - anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32) - anchor_centers = (anchor_centers * stride).reshape((-1, 2)) - if self._num_anchors > 1: - anchor_centers = np.stack([anchor_centers] * self._num_anchors, axis=1).reshape((-1, 2)) - if len(self.center_cache) < 100: - self.center_cache[key] = anchor_centers - - pos_inds = np.where(scores >= threshold)[0] - bboxes = distance2bbox(anchor_centers, bbox_preds) - pos_scores = scores[pos_inds] - pos_bboxes = bboxes[pos_inds] - scores_list.append(pos_scores) - bboxes_list.append(pos_bboxes) - if self.use_kps: - kpss = distance2kps(anchor_centers, kps_preds) - # kpss = kps_preds - kpss = kpss.reshape((kpss.shape[0], -1, 2)) - pos_kpss = kpss[pos_inds] - kpss_list.append(pos_kpss) - return scores_list, bboxes_list, kpss_list - - def detect(self, img, input_size=None, max_num=0, metric="default"): - 
assert input_size is not None or self.input_size is not None - input_size = self.input_size if input_size is None else input_size - - im_ratio = float(img.shape[0]) / img.shape[1] - model_ratio = float(input_size[1]) / input_size[0] - if im_ratio > model_ratio: - new_height = input_size[1] - new_width = int(new_height / im_ratio) - else: - new_width = input_size[0] - new_height = int(new_width * im_ratio) - det_scale = float(new_height) / img.shape[0] - resized_img = cv2.resize(img, (new_width, new_height)) - det_img = np.zeros((input_size[1], input_size[0], 3), dtype=np.uint8) - det_img[:new_height, :new_width, :] = resized_img - - scores_list, bboxes_list, kpss_list = self.forward(det_img, self.det_thresh) - - scores = np.vstack(scores_list) - scores_ravel = scores.ravel() - order = scores_ravel.argsort()[::-1] - bboxes = np.vstack(bboxes_list) / det_scale - if self.use_kps: - kpss = np.vstack(kpss_list) / det_scale - pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False) - pre_det = pre_det[order, :] - keep = self.nms(pre_det) - det = pre_det[keep, :] - if self.use_kps: - kpss = kpss[order, :, :] - kpss = kpss[keep, :, :] - else: - kpss = None - if max_num > 0 and det.shape[0] > max_num: - area = (det[:, 2] - det[:, 0]) * (det[:, 3] - det[:, 1]) - img_center = img.shape[0] // 2, img.shape[1] // 2 - offsets = np.vstack( - [ - (det[:, 0] + det[:, 2]) / 2 - img_center[1], - (det[:, 1] + det[:, 3]) / 2 - img_center[0], - ] - ) - offset_dist_squared = np.sum(np.power(offsets, 2.0), 0) - if metric == "max": - values = area - else: - values = area - offset_dist_squared * 2.0 # some extra weight on the centering - bindex = np.argsort(values)[::-1] # some extra weight on the centering - bindex = bindex[0:max_num] - det = det[bindex, :] - if kpss is not None: - kpss = kpss[bindex, :] - return det, kpss - - def nms(self, dets): - thresh = self.nms_thresh - x1 = dets[:, 0] - y1 = dets[:, 1] - x2 = dets[:, 2] - y2 = dets[:, 3] - scores = dets[:, 4] - - areas = (x2 - x1 + 1) * (y2 - y1 + 1) - order = scores.argsort()[::-1] - - keep = [] - while order.size > 0: - i = order[0] - keep.append(i) - xx1 = np.maximum(x1[i], x1[order[1:]]) - yy1 = np.maximum(y1[i], y1[order[1:]]) - xx2 = np.minimum(x2[i], x2[order[1:]]) - yy2 = np.minimum(y2[i], y2[order[1:]]) - - w = np.maximum(0.0, xx2 - xx1 + 1) - h = np.maximum(0.0, yy2 - yy1 + 1) - inter = w * h - ovr = inter / (areas[i] + areas[order[1:]] - inter) - - inds = np.where(ovr <= thresh)[0] - order = order[inds + 1] - - return keep - - - arcface_dst = np.array( - [ - [38.2946, 51.6963], - [73.5318, 51.5014], - [56.0252, 71.7366], - [41.5493, 92.3655], - [70.7299, 92.2041], - ], - dtype=np.float32, - ) - - - def estimate_norm(lmk, image_size=112, mode="arcface"): - assert lmk.shape == (5, 2) - assert image_size % 112 == 0 or image_size % 128 == 0 - if image_size % 112 == 0: - ratio = float(image_size) / 112.0 - diff_x = 0 - else: - ratio = float(image_size) / 128.0 - diff_x = 8.0 * ratio - dst = arcface_dst * ratio - dst[:, 0] += diff_x - tform = trans.SimilarityTransform() - tform.estimate(lmk, dst) - M = tform.params[0:2, :] - return M - - - def norm_crop(img, landmark, image_size=112, mode="arcface"): - M = estimate_norm(landmark, image_size, mode) - warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0) - return warped - - - class FaceEmbeddings: - def __init__(self, ov_model): - self.ov_model = ov_model - self.taskname = "recognition" - input_mean = 127.5 - input_std = 127.5 - self.input_mean = input_mean - 
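-            # (pixel - 127.5) / 127.5 maps uint8 input to roughly [-1, 1], the
-            # preprocessing expected by the ArcFace-style glintr100 embedding model.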
self.input_std = input_std - input_shape = self.ov_model.inputs[0].partial_shape - self.input_size = (input_shape[3].get_length(), input_shape[2].get_length()) - self.input_shape = input_shape - - def get(self, img, kps): - aimg = norm_crop(img, landmark=kps, image_size=self.input_size[0]) - embedding = self.get_feat(aimg).flatten() - return embedding - - def get_feat(self, imgs): - if not isinstance(imgs, list): - imgs = [imgs] - input_size = self.input_size - blob = np.concatenate([prepare_input(cv2.resize(img, input_size), self.input_mean, self.input_std, True) for img in imgs]) - - net_out = self.ov_model(blob)[0] - return net_out - - def forward(self, batch_data): - blob = (batch_data - self.input_mean) / self.input_std - net_out = self.ov_model(blob)[0] - return net_out - - - class OVFaceAnalysis: - def __init__(self, detect_model, embedding_model): - self.det_model = RetinaFace(detect_model) - self.embed_model = FaceEmbeddings(embedding_model) - - def get(self, img, max_num=0): - bboxes, kpss = self.det_model.detect(img, max_num=max_num, metric="default") - if bboxes.shape[0] == 0: - return [] - ret = [] - for i in range(bboxes.shape[0]): - bbox = bboxes[i, 0:4] - det_score = bboxes[i, 4] - kps = None - if kpss is not None: - kps = kpss[i] - embedding = self.embed_model.get(img, kps) - ret.append({"bbox": bbox, "score": det_score, "kps": kps, "embedding": embedding}) - return ret - -Now, let’s see models inference result - -Select Inference Device for Face Recognition -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - import openvino as ov - - from notebook_utils import device_widget - - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - core = ov.Core() - face_detector = core.compile_model(face_detector_path, device.value) - face_embedding = core.compile_model(face_embeddings_path, device.value) - -.. code:: ipython3 - - app = OVFaceAnalysis(face_detector, face_embedding) - -Perform Face Identity extraction -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Now, we can apply our ``OVFaceAnalysis`` pipeline on an image for -collection face embeddings and key points for reflection on the -generated image - -.. code:: ipython3 - - import PIL.Image - from pipeline_stable_diffusion_xl_instantid import draw_kps - - - def get_face_info(face_image: PIL.Image.Image): - r""" - Retrieve face information from the input face image. - - Args: - face_image (PIL.Image.Image): - An image containing a face. - - Returns: - face_emb (numpy.ndarray): - Facial embedding extracted from the face image. - face_kps (PIL.Image.Image): - Facial keypoints drawn on the face image. - """ - face_image = face_image.resize((832, 800)) - # prepare face emb - face_info = app.get(cv2.cvtColor(np.array(face_image), cv2.COLOR_RGB2BGR)) - if len(face_info) == 0: - raise RuntimeError("Couldn't find the face on the image") - face_info = sorted( - face_info, - key=lambda x: (x["bbox"][2] - x["bbox"][0]) * x["bbox"][3] - x["bbox"][1], - )[ - -1 - ] # only use the maximum face - face_emb = face_info["embedding"] - face_kps = draw_kps(face_image, face_info["kps"]) - return face_emb, face_kps - -.. code:: ipython3 - - from diffusers.utils import load_image - - face_image = Path("vermeer.jpg") - - if not face_image.exists(): - face_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/vermeer.jpg") - - face_emb, face_kps = get_face_info(face_image) - -.. 
code:: ipython3 - - face_image - - - - -.. image:: instant-id-with-output_files/instant-id-with-output_16_0.png - - - -.. code:: ipython3 - - face_kps - - - - -.. image:: instant-id-with-output_files/instant-id-with-output_17_0.png - - - -Prepare InstantID pipeline --------------------------- - - - -The code below downloads InstantID pipeline parts - ControlNet for face -pose and IP-Adapter for adding face embeddings to prompt - -.. code:: ipython3 - - from huggingface_hub import hf_hub_download - - hf_hub_download( - repo_id="InstantX/InstantID", - filename="ControlNetModel/config.json", - local_dir="./checkpoints", - ) - hf_hub_download( - repo_id="InstantX/InstantID", - filename="ControlNetModel/diffusion_pytorch_model.safetensors", - local_dir="./checkpoints", - ) - hf_hub_download(repo_id="InstantX/InstantID", filename="ip-adapter.bin", local_dir="./checkpoints"); - -As it was discussed in model description, InstantID does not required -diffusion model fine-tuning and can be applied on existing Stable -Diffusion pipeline. We will use -`stable-diffusion-xl-bas-1-0 `__ -as basic text-to-image diffusion pipeline. We also apply `LCM -LoRA `__ to -speedup the generation process. Previously, we already considered how to -convert and run SDXL model for Text-to-Image and Image-to-Image -generation using Optimum-Intel library (please check out this notebook -for `details `__), now -we will use it in combination with ControlNet and convert it using -OpenVINO Model Conversion API. - -.. code:: ipython3 - - from diffusers.models import ControlNetModel - from diffusers import LCMScheduler - from pipeline_stable_diffusion_xl_instantid import StableDiffusionXLInstantIDPipeline - - import torch - from PIL import Image - import gc - - - ov_controlnet_path = MODELS_DIR / "controlnet.xml" - ov_unet_path = MODELS_DIR / "unet.xml" - ov_vae_decoder_path = MODELS_DIR / "vae_decoder.xml" - ov_text_encoder_path = MODELS_DIR / "text_encoder.xml" - ov_text_encoder_2_path = MODELS_DIR / "text_encoder_2.xml" - ov_image_proj_encoder_path = MODELS_DIR / "image_proj_model.xml" - - required_pipeline_parts = [ - ov_controlnet_path, - ov_unet_path, - ov_vae_decoder_path, - ov_text_encoder_path, - ov_text_encoder_2_path, - ov_image_proj_encoder_path, - ] - - - def load_pytorch_pipeline(sdxl_id="stabilityai/stable-diffusion-xl-base-1.0"): - # prepare models under ./checkpoints - face_adapter = Path("checkpoints/ip-adapter.bin") - controlnet_path = Path("checkpoints/ControlNetModel") - - # load IdentityNet - controlnet = ControlNetModel.from_pretrained(controlnet_path) - - pipe = StableDiffusionXLInstantIDPipeline.from_pretrained(sdxl_id, controlnet=controlnet) - - # load adapter - pipe.load_ip_adapter_instantid(face_adapter) - # load lcm lora - pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl") - pipe.fuse_lora() - scheduler = LCMScheduler.from_config(pipe.scheduler.config) - pipe.set_ip_adapter_scale(0.8) - - controlnet, unet, vae = pipe.controlnet, pipe.unet, pipe.vae - text_encoder, text_encoder_2, tokenizer, tokenizer_2 = ( - pipe.text_encoder, - pipe.text_encoder_2, - pipe.tokenizer, - pipe.tokenizer_2, - ) - image_proj_model = pipe.image_proj_model - return ( - controlnet, - unet, - vae, - text_encoder, - text_encoder_2, - tokenizer, - tokenizer_2, - image_proj_model, - scheduler, - ) - - - load_torch_models = any([not path.exists() for path in required_pipeline_parts]) - - if load_torch_models: - ( - controlnet, - unet, - vae, - text_encoder, - text_encoder_2, - tokenizer, - tokenizer_2, - 
            image_proj_model,
-            scheduler,
-        ) = load_pytorch_pipeline()
-        tokenizer.save_pretrained(MODELS_DIR / "tokenizer")
-        tokenizer_2.save_pretrained(MODELS_DIR / "tokenizer_2")
-        scheduler.save_pretrained(MODELS_DIR / "scheduler")
-    else:
-        (
-            controlnet,
-            unet,
-            vae,
-            text_encoder,
-            text_encoder_2,
-            tokenizer,
-            tokenizer_2,
-            image_proj_model,
-            scheduler,
-        ) = (None, None, None, None, None, None, None, None, None)
-
-    gc.collect();
-
-Convert InstantID pipeline components to OpenVINO Intermediate Representation format
--------------------------------------------------------------------------------------
-
-
-
-Starting from the 2023.0 release, OpenVINO supports PyTorch model
-conversion directly. We need to provide a model object and input data for
-model tracing to the ``ov.convert_model`` function to obtain an OpenVINO
-``ov.Model`` object instance. The model can be saved to disk for later
-deployment using the ``ov.save_model`` function.
-
-The pipeline consists of the following important parts:
-
-- Image Projection model for getting image prompt embeddings. It is
-  similar to the IP-Adapter approach described in `this
-  tutorial `__,
-  but it uses face embeddings instead of an image as input for image
-  prompt encoding.
-- Text Encoders for creating text embeddings to generate an image from
-  a text prompt.
-- ControlNet for conditioning on the face keypoints image to transfer
-  the face pose to the generated image.
-- UNet for step-by-step denoising of the latent image representation.
-- Autoencoder (VAE) for decoding the latent space into an image.
-
-ControlNet
-~~~~~~~~~~
-
-
-
-ControlNet was introduced in the `Adding Conditional Control to
-Text-to-Image Diffusion Models `__
-paper. It provides a framework that enables support for various spatial
-contexts, such as a depth map, a segmentation map, a scribble, or key
-points, that can serve as additional conditioning for diffusion models
-such as Stable Diffusion. In this
-`tutorial `__,
-we already considered how to convert and use ControlNet with the Stable
-Diffusion pipeline. Using ControlNet with Stable Diffusion XL works the
-same way.
-
-.. 
code:: ipython3 - - import openvino as ov - from functools import partial - - - def cleanup_torchscript_cache(): - """ - Helper for removing cached model representation - """ - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - - - controlnet_example_input = { - "sample": torch.ones((2, 4, 100, 100)), - "timestep": torch.tensor(1, dtype=torch.float32), - "encoder_hidden_states": torch.randn((2, 77, 2048)), - "controlnet_cond": torch.randn((2, 3, 800, 800)), - "conditioning_scale": torch.tensor(0.8, dtype=torch.float32), - "added_cond_kwargs": { - "text_embeds": torch.zeros((2, 1280)), - "time_ids": torch.ones((2, 6), dtype=torch.int32), - }, - } - - - if not ov_controlnet_path.exists(): - controlnet.forward = partial(controlnet.forward, return_dict=False) - with torch.no_grad(): - ov_controlnet = ov.convert_model(controlnet, example_input=controlnet_example_input) - ov_controlnet.inputs[-1].get_node().set_element_type(ov.Type.f32) - ov_controlnet.inputs[-1].get_node().set_partial_shape(ov.PartialShape([-1, 6])) - ov_controlnet.validate_nodes_and_infer_types() - ov.save_model(ov_controlnet, ov_controlnet_path) - cleanup_torchscript_cache() - del ov_controlnet - gc.collect() - - if not ov_unet_path.exists(): - out = controlnet(**controlnet_example_input) - down_block_res_samples, mid_block_res_sample = out[0], out[1] - else: - down_block_res_samples, mid_block_res_sample = None, None - - del controlnet - gc.collect(); - -Unet -~~~~ - - - -Compared with Stable Diffusion, Stable Diffusion XL Unet has an -additional input for the ``time_ids`` condition. As we use ControlNet -and Image Projection Model, these models’ outputs also contribute to -preparing model input for Unet. - -.. 
code:: ipython3 - - from typing import Tuple - - - class UnetWrapper(torch.nn.Module): - def __init__( - self, - unet, - sample_dtype=torch.float32, - timestep_dtype=torch.int64, - encoder_hidden_states_dtype=torch.float32, - down_block_additional_residuals_dtype=torch.float32, - mid_block_additional_residual_dtype=torch.float32, - text_embeds_dtype=torch.float32, - time_ids_dtype=torch.int32, - ): - super().__init__() - self.unet = unet - self.sample_dtype = sample_dtype - self.timestep_dtype = timestep_dtype - self.encoder_hidden_states_dtype = encoder_hidden_states_dtype - self.down_block_additional_residuals_dtype = down_block_additional_residuals_dtype - self.mid_block_additional_residual_dtype = mid_block_additional_residual_dtype - self.text_embeds_dtype = text_embeds_dtype - self.time_ids_dtype = time_ids_dtype - - def forward( - self, - sample: torch.Tensor, - timestep: torch.Tensor, - encoder_hidden_states: torch.Tensor, - down_block_additional_residuals: Tuple[torch.Tensor], - mid_block_additional_residual: torch.Tensor, - text_embeds: torch.Tensor, - time_ids: torch.Tensor, - ): - sample.to(self.sample_dtype) - timestep.to(self.timestep_dtype) - encoder_hidden_states.to(self.encoder_hidden_states_dtype) - down_block_additional_residuals = [res.to(self.down_block_additional_residuals_dtype) for res in down_block_additional_residuals] - mid_block_additional_residual.to(self.mid_block_additional_residual_dtype) - added_cond_kwargs = { - "text_embeds": text_embeds.to(self.text_embeds_dtype), - "time_ids": time_ids.to(self.time_ids_dtype), - } - - return self.unet( - sample, - timestep, - encoder_hidden_states, - down_block_additional_residuals=down_block_additional_residuals, - mid_block_additional_residual=mid_block_additional_residual, - added_cond_kwargs=added_cond_kwargs, - ) - - - if not ov_unet_path.exists(): - unet_example_input = { - "sample": torch.ones((2, 4, 100, 100)), - "timestep": torch.tensor(1, dtype=torch.float32), - "encoder_hidden_states": torch.randn((2, 77, 2048)), - "down_block_additional_residuals": down_block_res_samples, - "mid_block_additional_residual": mid_block_res_sample, - "text_embeds": torch.zeros((2, 1280)), - "time_ids": torch.ones((2, 6), dtype=torch.int32), - } - unet = UnetWrapper(unet) - with torch.no_grad(): - ov_unet = ov.convert_model(unet, example_input=unet_example_input) - for i in range(3, len(ov_unet.inputs) - 2): - ov_unet.inputs[i].get_node().set_element_type(ov.Type.f32) - - ov_unet.validate_nodes_and_infer_types() - ov.save_model(ov_unet, ov_unet_path) - del ov_unet - cleanup_torchscript_cache() - gc.collect() - - del unet - gc.collect(); - -VAE Decoder -~~~~~~~~~~~ - - - -The VAE model has two parts, an encoder and a decoder. The encoder is -used to convert the image into a low dimensional latent representation, -which will serve as the input to the U-Net model. The decoder, -conversely, transforms the latent representation back into an image. For -InstantID pipeline we will use VAE only for decoding Unet generated -image, it means that we can skip VAE encoder part conversion. - -.. 
code:: ipython3 - - class VAEDecoderWrapper(torch.nn.Module): - def __init__(self, vae_decoder): - super().__init__() - self.vae = vae_decoder - - def forward(self, latents): - return self.vae.decode(latents) - - - if not ov_vae_decoder_path.exists(): - vae_decoder = VAEDecoderWrapper(vae) - - with torch.no_grad(): - ov_vae_decoder = ov.convert_model(vae_decoder, example_input=torch.zeros((1, 4, 64, 64))) - ov.save_model(ov_vae_decoder, ov_vae_decoder_path) - del ov_vae_decoder - cleanup_torchscript_cache() - del vae_decoder - gc.collect() - - del vae - gc.collect(); - -Text Encoders -~~~~~~~~~~~~~ - - - -The text-encoder is responsible for transforming the input prompt, for -example, “a photo of an astronaut riding a horse” into an embedding -space that can be understood by the U-Net. It is usually a simple -transformer-based encoder that maps a sequence of input tokens to a -sequence of latent text embeddings. - -.. code:: ipython3 - - import types - - inputs = {"input_ids": torch.ones((1, 77), dtype=torch.long)} - - if not ov_text_encoder_path.exists(): - text_encoder.eval() - text_encoder.config.output_hidden_states = True - text_encoder.config.return_dict = False - with torch.no_grad(): - ov_text_encoder = ov.convert_model(text_encoder, example_input=inputs) - ov.save_model(ov_text_encoder, ov_text_encoder_path) - del ov_text_encoder - cleanup_torchscript_cache() - gc.collect() - - del text_encoder - gc.collect() - - - def text_encoder_fwd_wrapper(self, input_ids): - res = self._orig_forward(input_ids, return_dict=True, output_hidden_states=True) - return tuple([v for v in res.values() if v is not None]) - - - if not ov_text_encoder_2_path.exists(): - text_encoder_2.eval() - text_encoder_2._orig_forward = text_encoder_2.forward - text_encoder_2.forward = types.MethodType(text_encoder_fwd_wrapper, text_encoder_2) - - with torch.no_grad(): - ov_text_encoder = ov.convert_model(text_encoder_2, example_input=inputs) - ov.save_model(ov_text_encoder, ov_text_encoder_2_path) - del ov_text_encoder - cleanup_torchscript_cache() - del text_encoder_2 - gc.collect(); - -Image Projection Model -~~~~~~~~~~~~~~~~~~~~~~ - - - -Image projection model is responsible to transforming face embeddings to -image prompt embeddings - -.. code:: ipython3 - - if not ov_image_proj_encoder_path.exists(): - with torch.no_grad(): - ov_image_encoder = ov.convert_model(image_proj_model, example_input=torch.zeros((2, 1, 512))) - ov.save_model(ov_image_encoder, ov_image_proj_encoder_path) - del ov_image_encoder - cleanup_torchscript_cache() - del image_proj_model - gc.collect(); - -Prepare OpenVINO InstantID Pipeline ------------------------------------ - - - -.. 
code:: ipython3 - - import numpy as np - from diffusers import StableDiffusionXLControlNetPipeline - from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput - from typing import Any, Callable, Dict, List, Optional, Tuple, Union - - import torch - - from diffusers.image_processor import PipelineImageInput, VaeImageProcessor - - - class OVStableDiffusionXLInstantIDPipeline(StableDiffusionXLControlNetPipeline): - def __init__( - self, - text_encoder, - text_encoder_2, - image_proj_model, - controlnet, - unet, - vae_decoder, - tokenizer, - tokenizer_2, - scheduler, - ): - self.text_encoder = text_encoder - self.text_encoder_2 = text_encoder_2 - self.tokenizer = tokenizer - self.tokenizer_2 = tokenizer_2 - self.image_proj_model = image_proj_model - self.controlnet = controlnet - self.unet = unet - self.vae_decoder = vae_decoder - self.scheduler = scheduler - self.image_proj_model_in_features = 512 - self.vae_scale_factor = 8 - self.vae_scaling_factor = 0.13025 - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) - self.control_image_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, - do_convert_rgb=True, - do_normalize=False, - ) - self._internal_dict = {} - self._progress_bar_config = {} - - def _encode_prompt_image_emb(self, prompt_image_emb, num_images_per_prompt, do_classifier_free_guidance): - if isinstance(prompt_image_emb, torch.Tensor): - prompt_image_emb = prompt_image_emb.clone().detach() - else: - prompt_image_emb = torch.tensor(prompt_image_emb) - - prompt_image_emb = prompt_image_emb.reshape([1, -1, self.image_proj_model_in_features]) - - if do_classifier_free_guidance: - prompt_image_emb = torch.cat([torch.zeros_like(prompt_image_emb), prompt_image_emb], dim=0) - else: - prompt_image_emb = torch.cat([prompt_image_emb], dim=0) - prompt_image_emb = self.image_proj_model(prompt_image_emb)[0] - - bs_embed, seq_len, _ = prompt_image_emb.shape - prompt_image_emb = np.tile(prompt_image_emb, (1, num_images_per_prompt, 1)) - prompt_image_emb = prompt_image_emb.reshape(bs_embed * num_images_per_prompt, seq_len, -1) - - return prompt_image_emb - - def __call__( - self, - prompt: Union[str, List[str]] = None, - prompt_2: Optional[Union[str, List[str]]] = None, - image: PipelineImageInput = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 5.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - negative_prompt_2: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - image_embeds: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_conditioning_scale: Union[float, List[float]] = 1.0, - guess_mode: bool = False, - control_guidance_start: Union[float, List[float]] = 0.0, - control_guidance_end: Union[float, List[float]] = 1.0, - original_size: Tuple[int, int] = None, - crops_coords_top_left: Tuple[int, int] = (0, 0), - target_size: Tuple[int, int] = None, - 
negative_original_size: Optional[Tuple[int, int]] = None, - negative_crops_coords_top_left: Tuple[int, int] = (0, 0), - negative_target_size: Optional[Tuple[int, int]] = None, - clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - # IP adapter - ip_adapter_scale=None, - **kwargs, - ): - r""" - The call function to the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is - used in both text-encoders. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): - The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height__module.unet.up_blocks.0.upsamplers.0.conv.base_layer/aten::_convolu - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. - height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The height in pixels of the generated image. Anything below 512 pixels won't work well for - [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) - and checkpoints that are not specifically fine-tuned on low resolutions. - width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The width in pixels of the generated image. Anything below 512 pixels won't work well for - [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) - and checkpoints that are not specifically fine-tuned on low resolutions. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 5.0): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide what to not include in image generation. If not defined, you need to - pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). - negative_prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts to guide what to not include in image generation. This is sent to `tokenizer_2` - and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders. - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. 
- eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies - to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not - provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If - not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If - not provided, pooled text embeddings are generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt - weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input - argument. - image_embeds (`torch.FloatTensor`, *optional*): - Pre-generated image embeddings. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): - The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added - to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set - the corresponding scale as a list. - control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): - The percentage of total steps at which the ControlNet starts applying. - control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): - The percentage of total steps at which the ControlNet stops applying. - original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. - `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as - explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position - `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting - `crops_coords_top_left` to (0, 0). 
Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - For most cases, `target_size` should be set to the desired height and width of the generated image. If - not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in - section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - To negatively condition the generation process based on a specific image resolution. Part of SDXL's - micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer toencode_pro this issue thread: https://github.com/huggingface/diffusers/issues/4208. - negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's - micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - To negatively condition the generation process based on a target image resolution. It should be as same - as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, - otherwise a `tuple` is returned containing the output images. - """ - - do_classifier_free_guidance = guidance_scale >= 1.0 - # align format for control guidance - if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): - control_guidance_start = len(control_guidance_end) * [control_guidance_start] - elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): - control_guidance_end = len(control_guidance_start) * [control_guidance_end] - elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): - control_guidance_start, control_guidance_end = ( - [control_guidance_start], - [control_guidance_end], - ) - - # 2. 
Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) = self.encode_prompt( - prompt, - prompt_2, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - negative_prompt_2, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - pooled_prompt_embeds=pooled_prompt_embeds, - negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, - lora_scale=None, - clip_skip=clip_skip, - ) - - # 3.2 Encode image prompt - prompt_image_emb = self._encode_prompt_image_emb(image_embeds, num_images_per_prompt, do_classifier_free_guidance) - - # 4. Prepare image - image = self.prepare_image( - image=image, - width=width, - height=height, - batch_size=batch_size * num_images_per_prompt, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, - ) - height, width = image.shape[-2:] - - # 5. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 6. Prepare latent variables - num_channels_latents = 4 - latents = self.prepare_latents( - int(batch_size) * int(num_images_per_prompt), - int(num_channels_latents), - int(height), - int(width), - dtype=torch.float32, - device=torch.device("cpu"), - generator=generator, - latents=latents, - ) - - # 7. Prepare extra step kwargs. - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 7.1 Create tensor stating which controlnets to keep - controlnet_keep = [] - for i in range(len(timesteps)): - keeps = [1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) for s, e in zip(control_guidance_start, control_guidance_end)] - controlnet_keep.append(keeps) - - # 7.2 Prepare added time ids & embeddings - if isinstance(image, list): - original_size = original_size or image[0].shape[-2:] - else: - original_size = original_size or image.shape[-2:] - target_size = target_size or (height, width) - - add_text_embeds = pooled_prompt_embeds - if self.text_encoder_2 is None: - text_encoder_projection_dim = pooled_prompt_embeds.shape[-1] - else: - text_encoder_projection_dim = 1280 - - add_time_ids = self._get_add_time_ids( - original_size, - crops_coords_top_left, - target_size, - text_encoder_projection_dim=text_encoder_projection_dim, - ) - - if negative_original_size is not None and negative_target_size is not None: - negative_add_time_ids = self._get_add_time_ids( - negative_original_size, - negative_crops_coords_top_left, - negative_target_size, - text_encoder_projection_dim=text_encoder_projection_dim, - ) - else: - negative_add_time_ids = add_time_ids - - if do_classifier_free_guidance: - prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds], axis=0) - add_text_embeds = np.concatenate([negative_pooled_prompt_embeds, add_text_embeds], axis=0) - add_time_ids = np.concatenate([negative_add_time_ids, add_time_ids], axis=0) - - add_time_ids = np.tile(add_time_ids, (batch_size * num_images_per_prompt, 1)) - encoder_hidden_states = np.concatenate([prompt_embeds, prompt_image_emb], axis=1) - - # 8. 
Denoising loop - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # controlnet(s) inference - control_model_input = latent_model_input - - cond_scale = controlnet_conditioning_scale - - controlnet_outputs = self.controlnet( - [ - control_model_input, - t, - prompt_image_emb, - image, - cond_scale, - add_text_embeds, - add_time_ids, - ] - ) - - controlnet_additional_blocks = list(controlnet_outputs.values()) - - # predict the noise residual - noise_pred = self.unet( - [ - latent_model_input, - t, - encoder_hidden_states, - *controlnet_additional_blocks, - add_text_embeds, - add_time_ids, - ] - )[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1] - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - torch.from_numpy(noise_pred), - t, - latents, - **extra_step_kwargs, - return_dict=False, - )[0] - progress_bar.update() - - if not output_type == "latent": - image = self.vae_decoder(latents / self.vae_scaling_factor)[0] - else: - image = latents - - if not output_type == "latent": - image = self.image_processor.postprocess(torch.from_numpy(image), output_type=output_type) - - if not return_dict: - return (image,) - - return StableDiffusionXLPipelineOutput(images=image) - - def encode_prompt( - self, - prompt: str, - prompt_2: Optional[str] = None, - num_images_per_prompt: int = 1, - do_classifier_free_guidance: bool = True, - negative_prompt: Optional[str] = None, - negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - lora_scale: Optional[float] = None, - clip_skip: Optional[int] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is - used in both text-encoders - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - negative_prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and - `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. - lora_scale (`float`, *optional*): - A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. - """ - prompt = [prompt] if isinstance(prompt, str) else prompt - - if prompt is not None: - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # Define tokenizers and text encoders - tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] - text_encoders = [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - - if prompt_embeds is None: - prompt_2 = prompt_2 or prompt - prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 - - # textual inversion: procecss multi-vector tokens if necessary - prompt_embeds_list = [] - prompts = [prompt, prompt_2] - for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - - text_input_ids = text_inputs.input_ids - - prompt_embeds = text_encoder(text_input_ids) - - # We are only ALWAYS interested in the pooled output of the final text encoder - pooled_prompt_embeds = prompt_embeds[0] - hidden_states = list(prompt_embeds.values())[1:] - if clip_skip is None: - prompt_embeds = hidden_states[-2] - else: - # "2" because SDXL always indexes from the penultimate layer. 
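-                            # For example, clip_skip=1 selects hidden_states[-3], while the
-                            # clip_skip=None branch above uses the penultimate hidden_states[-2].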
- prompt_embeds = hidden_states[-(clip_skip + 2)] - - prompt_embeds_list.append(prompt_embeds) - - prompt_embeds = np.concatenate(prompt_embeds_list, axis=-1) - - # get unconditional embeddings for classifier free guidance - zero_out_negative_prompt = negative_prompt is None - if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: - negative_prompt_embeds = np.zeros_like(prompt_embeds) - negative_pooled_prompt_embeds = np.zeros_like(pooled_prompt_embeds) - elif do_classifier_free_guidance and negative_prompt_embeds is None: - negative_prompt = negative_prompt or "" - negative_prompt_2 = negative_prompt_2 or negative_prompt - - # normalize str to list - negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt - negative_prompt_2 = batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 - - uncond_tokens: List[str] - if prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError(f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" f" {type(prompt)}.") - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = [negative_prompt, negative_prompt_2] - - negative_prompt_embeds_list = [] - for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): - max_length = prompt_embeds.shape[1] - uncond_input = tokenizer( - negative_prompt, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - - negative_prompt_embeds = text_encoder(uncond_input.input_ids) - # We are only ALWAYS interested in the pooled output of the final text encoder - negative_pooled_prompt_embeds = negative_prompt_embeds[0] - hidden_states = list(negative_prompt_embeds.values())[1:] - negative_prompt_embeds = hidden_states[-2] - - negative_prompt_embeds_list.append(negative_prompt_embeds) - - negative_prompt_embeds = np.concatenate(negative_prompt_embeds_list, axis=-1) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = np.tile(prompt_embeds, (1, num_images_per_prompt, 1)) - prompt_embeds = prompt_embeds.reshape(bs_embed * num_images_per_prompt, seq_len, -1) - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = np.tile(negative_prompt_embeds, (1, num_images_per_prompt, 1)) - negative_prompt_embeds = negative_prompt_embeds.reshape(batch_size * num_images_per_prompt, seq_len, -1) - - pooled_prompt_embeds = np.tile(pooled_prompt_embeds, (1, num_images_per_prompt)).reshape(bs_embed * num_images_per_prompt, -1) - if do_classifier_free_guidance: - negative_pooled_prompt_embeds = np.tile(negative_pooled_prompt_embeds, (1, num_images_per_prompt)).reshape(bs_embed * num_images_per_prompt, -1) - - return ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) - - def prepare_image( - self, - image, - width, - height, - batch_size, - num_images_per_prompt, - do_classifier_free_guidance=False, - guess_mode=False, - ): - image = 
self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) - image_batch_size = image.shape[0] - - if image_batch_size == 1: - repeat_by = batch_size - else: - # image batch size is the same as prompt batch size - repeat_by = num_images_per_prompt - - image = image.repeat_interleave(repeat_by, dim=0) - - if do_classifier_free_guidance and not guess_mode: - image = torch.cat([image] * 2) - - return image - - def _get_add_time_ids( - self, - original_size, - crops_coords_top_left, - target_size, - text_encoder_projection_dim, - ): - add_time_ids = list(original_size + crops_coords_top_left + target_size) - add_time_ids = torch.tensor([add_time_ids]) - return add_time_ids - -Run OpenVINO pipeline inference -------------------------------- - - - -Select inference device for InstantID -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Create pipeline -~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from transformers import AutoTokenizer - - - def create_ov_pipe( - text_encoder_path, - text_encoder_2_path, - image_proj_encoder_path, - controlnet_path, - unet_path, - vae_decoder_path, - tokenizer_path, - tokenizer_2_path, - scheduler_path, - ): - return OVStableDiffusionXLInstantIDPipeline( - core.compile_model(text_encoder_path, device.value), - core.compile_model(text_encoder_2_path, device.value), - core.compile_model(image_proj_encoder_path, device.value), - core.compile_model(controlnet_path, device.value), - core.compile_model(unet_path, device.value), - core.compile_model(vae_decoder_path, device.value), - AutoTokenizer.from_pretrained(tokenizer_path), - AutoTokenizer.from_pretrained(tokenizer_2_path), - LCMScheduler.from_pretrained(scheduler_path), - ) - - - ov_pipe = create_ov_pipe( - ov_text_encoder_path, - ov_text_encoder_2_path, - ov_image_proj_encoder_path, - ov_controlnet_path, - ov_unet_path, - ov_vae_decoder_path, - MODELS_DIR / "tokenizer", - MODELS_DIR / "tokenizer_2", - MODELS_DIR / "scheduler", - ) - -Run inference -~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - prompt = "Anime girl" - negative_prompt = "" - - image = ov_pipe( - prompt, - image_embeds=face_emb, - image=face_kps, - num_inference_steps=4, - negative_prompt=negative_prompt, - guidance_scale=0.5, - generator=torch.Generator(device="cpu").manual_seed(1749781188), - ).images[0] - - - -.. parsed-literal:: - - 0%| | 0/4 [00:00`__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -According to ``OVStableDiffusionXLInstantIDPipeline`` structure, -ControlNet and UNet models are used in the cycle repeating inference on -each diffusion step, while other parts of pipeline take part only once. -Now we will show you how to optimize pipeline using -`NNCF `__ to reduce memory and -computation cost. - -Please select below whether you would like to run quantization to -improve model inference speed. - - **NOTE**: Quantization is time and memory consuming operation. - Running quantization code below may take some time. - -.. 
code:: ipython3 - - from notebook_utils import quantization_widget - - skip_for_device = "GPU" in device.value or (device.value == "AUTO" and any("GPU" in device_name for device_name in core.available_devices)) - to_quantize = quantization_widget(not skip_for_device) - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - import requests - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - int8_pipe = None - - %load_ext skip_kernel_extension - -Prepare calibration datasets -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use a portion of -`wider_face `__ dataset -from Hugging Face as calibration data. We use prompts below to guide -image generation and to determine what not to include in the resulting -image. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - negative_prompts = [ - "blurry unreal occluded", - "low contrast disfigured uncentered mangled", - "amateur out of frame low quality nsfw", - "ugly underexposed jpeg artifacts", - "low saturation disturbing content", - "overexposed severe distortion", - "amateur NSFW", - "ugly mutilated out of frame disfigured", - ] - prompts = [ - "a Naruto-style image of a young boy, incorporating dynamic action lines, intense energy effects, and a sense of movement and power", - "an anime-style girl, with vibrant, otherworldly colors, fantastical elements, and a sense of awe", - "analog film photo of a man. faded film, desaturated, 35mm photo, grainy, vignette, vintage, Kodachrome, Lomography, stained, highly detailed, found footage, masterpiece, best quality", - "Apply a staining filter to give the impression of aged, worn-out film while maintaining sharp detail on a portrait of a woman", - "a modern picture of a boy an antique feel through selective desaturation, grain addition, and a warm tone, mimicking the style of old photographs", - "a dreamy, ethereal portrait of a young girl, featuring soft, pastel colors, a blurred background, and a touch of bokeh", - "a dynamic, action-packed image of a boy in motion, using motion blur, panning, and other techniques to convey a sense of speed and energy", - "a dramatic, cinematic image of a boy, using color grading, contrast adjustments, and a widescreen aspect ratio, to create a sense of epic scale and grandeur", - "a portrait of a woman in the style of Picasso's cubism, featuring fragmented shapes, bold lines, and a vibrant color palette", - "an artwork in the style of Picasso's Blue Period, featuring a somber, melancholic portrait of a person, with muted colors, elongated forms, and a sense of introspection and contemplation", - ] - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - import datasets - - num_inference_steps = 4 - subset_size = 200 - - ov_int8_unet_path = MODELS_DIR / 'unet_optimized.xml' - ov_int8_controlnet_path = MODELS_DIR / 'controlnet_optimized.xml' - - num_samples = int(np.ceil(subset_size / num_inference_steps)) - dataset = datasets.load_dataset("wider_face", split="train", streaming=True, trust_remote_code=True).shuffle(seed=42) - face_info = [] - for batch in dataset: - try: - face_info.append(get_face_info(batch["image"])) - except RuntimeError: - continue - if len(face_info) > num_samples: - break - -To collect intermediate model inputs for calibration we should customize -``CompiledModel``. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from tqdm.notebook import tqdm - from transformers import set_seed - - set_seed(42) - - class CompiledModelDecorator(ov.CompiledModel): - def __init__(self, compiled_model: ov.CompiledModel, keep_prob: float = 1.0): - super().__init__(compiled_model) - self.data_cache = [] - self.keep_prob = np.clip(keep_prob, 0, 1) - - def __call__(self, *args, **kwargs): - if np.random.rand() <= self.keep_prob: - self.data_cache.append(*args) - return super().__call__(*args, **kwargs) - - - def collect_calibration_data(pipeline, face_info, subset_size): - original_unet = pipeline.unet - pipeline.unet = CompiledModelDecorator(original_unet) - pipeline.set_progress_bar_config(disable=True) - - pbar = tqdm(total=subset_size) - for face_emb, face_kps in face_info: - negative_prompt = np.random.choice(negative_prompts) - prompt = np.random.choice(prompts) - _ = pipeline( - prompt, - image_embeds=face_emb, - image=face_kps, - num_inference_steps=num_inference_steps, - negative_prompt=negative_prompt, - guidance_scale=0.5, - generator=torch.Generator(device="cpu").manual_seed(1749781188) - ) - collected_subset_size = len(pipeline.unet.data_cache) - pbar.update(collected_subset_size - pbar.n) - - calibration_dataset = pipeline.unet.data_cache[:subset_size] - pipeline.set_progress_bar_config(disable=False) - pipeline.unet = original_unet - return calibration_dataset - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - if not (ov_int8_unet_path.exists() and ov_int8_controlnet_path.exists()): - unet_calibration_data = collect_calibration_data(ov_pipe, face_info, subset_size=subset_size) - - - -.. parsed-literal:: - - 0%| | 0/200 [00:00 bool: - allowed_types_list = ["f16", "f32", "f64"] - const_port_id = 0 - input_tensor = node.input_value(const_port_id) - if input_tensor.get_element_type().get_type_name() in allowed_types_list: - const_node = get_operation_const_op(node, const_port_id) - if const_node is not None: - return True - - return False - - - def collect_ops_with_weights(model): - ops_with_weights = [] - for op in model.get_ops(): - if op.get_type_name() == "MatMul": - constant_node_0 = get_operation_const_op(op, const_port_id=0) - constant_node_1 = get_operation_const_op(op, const_port_id=1) - if constant_node_0 or constant_node_1: - ops_with_weights.append(op.get_friendly_name()) - if op.get_type_name() == "Gather" and is_embedding(op): - ops_with_weights.append(op.get_friendly_name()) - - return ops_with_weights - -.. 
-.. code:: ipython3
-
-    %%skip not $to_quantize.value
-
-    if not ov_int8_unet_path.exists():
-        unet = core.read_model(ov_unet_path)
-        unet_ignored_scope = collect_ops_with_weights(unet)
-        compressed_unet = nncf.compress_weights(unet, ignored_scope=nncf.IgnoredScope(types=['Convolution']))
-        quantized_unet = nncf.quantize(
-            model=compressed_unet,
-            calibration_dataset=nncf.Dataset(unet_calibration_data),
-            subset_size=subset_size,
-            model_type=nncf.ModelType.TRANSFORMER,
-            ignored_scope=nncf.IgnoredScope(names=unet_ignored_scope),
-            advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=-1)
-        )
-        ov.save_model(quantized_unet, ov_int8_unet_path)
-
-
-.. parsed-literal::
-
-    INFO:nncf:51 ignored nodes were found by types in the NNCFGraph
-    INFO:nncf:Statistics of the bitwidth distribution:
-    ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
-    │   Num bits (N) │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
-    ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
-    │              8 │ 100% (883 / 883)            │ 100% (883 / 883)                       │
-    ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙
-
-
-.. parsed-literal::
-
-    Output()
-
-
-.. parsed-literal::
-
-    /home/ea/work/py3.11/lib/python3.11/site-packages/nncf/quantization/algorithms/post_training/pipeline.py:87: FutureWarning: `AdvancedQuantizationParameters(smooth_quant_alpha=..)` is deprecated.Please, use `AdvancedQuantizationParameters(smooth_quant_alphas)` option with AdvancedSmoothQuantParameters(convolution=.., matmul=..) as value instead.
-      warning_deprecated(
-
-
-.. parsed-literal::
-
-    INFO:nncf:883 ignored nodes were found by names in the NNCFGraph
-    INFO:nncf:Not adding activation input quantizer for operation: 100 __module.unet.mid_block.attentions.0.transformer_blocks.6.attn2.to_v/aten::linear/MatMul
-    INFO:nncf:Not adding activation input quantizer for operation: 101 __module.unet.mid_block.attentions.0.transformer_blocks.7.attn2.to_k/aten::linear/MatMul

[... the remainder of this removed output block consists of several hundred near-identical "INFO:nncf:Not adding activation input quantizer for operation: ..." lines covering the ignored attention, feed-forward, and time-embedding MatMul/Add operations ...]
__module.unet.down_blocks.2.attentions.0.transformer_blocks.3.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3448 __module.unet.down_blocks.2.attentions.0.transformer_blocks.3.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1817 __module.unet.down_blocks.2.attentions.0.transformer_blocks.3.attn2.to_out.0/aten::linear/MatMul - 1936 __module.unet.down_blocks.2.attentions.0.transformer_blocks.3.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2856 __module.unet.down_blocks.2.attentions.0.transformer_blocks.3.ff.net.0.proj/aten::linear/MatMul - 3099 __module.unet.down_blocks.2.attentions.0.transformer_blocks.3.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4302 __module.unet.down_blocks.2.attentions.0.transformer_blocks.3.ff.net.2/aten::linear/MatMul - 4614 __module.unet.down_blocks.2.attentions.0.transformer_blocks.3.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3100 __module.unet.down_blocks.2.attentions.0.transformer_blocks.4.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3101 __module.unet.down_blocks.2.attentions.0.transformer_blocks.4.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3102 __module.unet.down_blocks.2.attentions.0.transformer_blocks.4.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5108 __module.unet.down_blocks.2.attentions.0.transformer_blocks.4.attn1.to_out.0/aten::linear/MatMul - 5186 __module.unet.down_blocks.2.attentions.0.transformer_blocks.4.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3454 __module.unet.down_blocks.2.attentions.0.transformer_blocks.4.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1818 __module.unet.down_blocks.2.attentions.0.transformer_blocks.4.attn2.to_out.0/aten::linear/MatMul - 1937 __module.unet.down_blocks.2.attentions.0.transformer_blocks.4.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2859 __module.unet.down_blocks.2.attentions.0.transformer_blocks.4.ff.net.0.proj/aten::linear/MatMul - 3104 __module.unet.down_blocks.2.attentions.0.transformer_blocks.4.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4306 __module.unet.down_blocks.2.attentions.0.transformer_blocks.4.ff.net.2/aten::linear/MatMul - 4618 __module.unet.down_blocks.2.attentions.0.transformer_blocks.4.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3105 __module.unet.down_blocks.2.attentions.0.transformer_blocks.5.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3106 __module.unet.down_blocks.2.attentions.0.transformer_blocks.5.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3107 __module.unet.down_blocks.2.attentions.0.transformer_blocks.5.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5109 __module.unet.down_blocks.2.attentions.0.transformer_blocks.5.attn1.to_out.0/aten::linear/MatMul - 5187 __module.unet.down_blocks.2.attentions.0.transformer_blocks.5.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding 
activation input quantizer for operation: 3460 __module.unet.down_blocks.2.attentions.0.transformer_blocks.5.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1819 __module.unet.down_blocks.2.attentions.0.transformer_blocks.5.attn2.to_out.0/aten::linear/MatMul - 1938 __module.unet.down_blocks.2.attentions.0.transformer_blocks.5.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2862 __module.unet.down_blocks.2.attentions.0.transformer_blocks.5.ff.net.0.proj/aten::linear/MatMul - 3109 __module.unet.down_blocks.2.attentions.0.transformer_blocks.5.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4310 __module.unet.down_blocks.2.attentions.0.transformer_blocks.5.ff.net.2/aten::linear/MatMul - 4622 __module.unet.down_blocks.2.attentions.0.transformer_blocks.5.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3110 __module.unet.down_blocks.2.attentions.0.transformer_blocks.6.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3111 __module.unet.down_blocks.2.attentions.0.transformer_blocks.6.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3112 __module.unet.down_blocks.2.attentions.0.transformer_blocks.6.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5110 __module.unet.down_blocks.2.attentions.0.transformer_blocks.6.attn1.to_out.0/aten::linear/MatMul - 5188 __module.unet.down_blocks.2.attentions.0.transformer_blocks.6.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3466 __module.unet.down_blocks.2.attentions.0.transformer_blocks.6.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1820 __module.unet.down_blocks.2.attentions.0.transformer_blocks.6.attn2.to_out.0/aten::linear/MatMul - 1939 __module.unet.down_blocks.2.attentions.0.transformer_blocks.6.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2865 __module.unet.down_blocks.2.attentions.0.transformer_blocks.6.ff.net.0.proj/aten::linear/MatMul - 3114 __module.unet.down_blocks.2.attentions.0.transformer_blocks.6.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4314 __module.unet.down_blocks.2.attentions.0.transformer_blocks.6.ff.net.2/aten::linear/MatMul - 4626 __module.unet.down_blocks.2.attentions.0.transformer_blocks.6.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3115 __module.unet.down_blocks.2.attentions.0.transformer_blocks.7.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3116 __module.unet.down_blocks.2.attentions.0.transformer_blocks.7.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3117 __module.unet.down_blocks.2.attentions.0.transformer_blocks.7.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5111 __module.unet.down_blocks.2.attentions.0.transformer_blocks.7.attn1.to_out.0/aten::linear/MatMul - 5189 __module.unet.down_blocks.2.attentions.0.transformer_blocks.7.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3472 
__module.unet.down_blocks.2.attentions.0.transformer_blocks.7.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1821 __module.unet.down_blocks.2.attentions.0.transformer_blocks.7.attn2.to_out.0/aten::linear/MatMul - 1940 __module.unet.down_blocks.2.attentions.0.transformer_blocks.7.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2868 __module.unet.down_blocks.2.attentions.0.transformer_blocks.7.ff.net.0.proj/aten::linear/MatMul - 3119 __module.unet.down_blocks.2.attentions.0.transformer_blocks.7.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4318 __module.unet.down_blocks.2.attentions.0.transformer_blocks.7.ff.net.2/aten::linear/MatMul - 4630 __module.unet.down_blocks.2.attentions.0.transformer_blocks.7.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3120 __module.unet.down_blocks.2.attentions.0.transformer_blocks.8.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3121 __module.unet.down_blocks.2.attentions.0.transformer_blocks.8.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3122 __module.unet.down_blocks.2.attentions.0.transformer_blocks.8.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5112 __module.unet.down_blocks.2.attentions.0.transformer_blocks.8.attn1.to_out.0/aten::linear/MatMul - 5190 __module.unet.down_blocks.2.attentions.0.transformer_blocks.8.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3478 __module.unet.down_blocks.2.attentions.0.transformer_blocks.8.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1822 __module.unet.down_blocks.2.attentions.0.transformer_blocks.8.attn2.to_out.0/aten::linear/MatMul - 1941 __module.unet.down_blocks.2.attentions.0.transformer_blocks.8.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2871 __module.unet.down_blocks.2.attentions.0.transformer_blocks.8.ff.net.0.proj/aten::linear/MatMul - 3124 __module.unet.down_blocks.2.attentions.0.transformer_blocks.8.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4322 __module.unet.down_blocks.2.attentions.0.transformer_blocks.8.ff.net.2/aten::linear/MatMul - 4634 __module.unet.down_blocks.2.attentions.0.transformer_blocks.8.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3125 __module.unet.down_blocks.2.attentions.0.transformer_blocks.9.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3126 __module.unet.down_blocks.2.attentions.0.transformer_blocks.9.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3127 __module.unet.down_blocks.2.attentions.0.transformer_blocks.9.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5113 __module.unet.down_blocks.2.attentions.0.transformer_blocks.9.attn1.to_out.0/aten::linear/MatMul - 5191 __module.unet.down_blocks.2.attentions.0.transformer_blocks.9.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3484 __module.unet.down_blocks.2.attentions.0.transformer_blocks.9.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding 
activation input quantizer for operation: 1823 __module.unet.down_blocks.2.attentions.0.transformer_blocks.9.attn2.to_out.0/aten::linear/MatMul - 1942 __module.unet.down_blocks.2.attentions.0.transformer_blocks.9.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2874 __module.unet.down_blocks.2.attentions.0.transformer_blocks.9.ff.net.0.proj/aten::linear/MatMul - 3129 __module.unet.down_blocks.2.attentions.0.transformer_blocks.9.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4326 __module.unet.down_blocks.2.attentions.0.transformer_blocks.9.ff.net.2/aten::linear/MatMul - 4638 __module.unet.down_blocks.2.attentions.0.transformer_blocks.9.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2407 __module.unet.down_blocks.2.attentions.0.proj_out/aten::linear/MatMul - 2636 __module.unet.down_blocks.2.attentions.0.proj_out/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 5273 __module.unet.down_blocks.2.attentions.1.proj_in/aten::linear/MatMul - 5279 __module.unet.down_blocks.2.attentions.1.proj_in/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 5311 __module.unet.down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5312 __module.unet.down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5313 __module.unet.down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5347 __module.unet.down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.0/aten::linear/MatMul - 5349 __module.unet.down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 5325 __module.unet.down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1824 __module.unet.down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.0/aten::linear/MatMul - 1943 __module.unet.down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2876 __module.unet.down_blocks.2.attentions.1.transformer_blocks.0.ff.net.0.proj/aten::linear/MatMul - 3131 __module.unet.down_blocks.2.attentions.1.transformer_blocks.0.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4334 __module.unet.down_blocks.2.attentions.1.transformer_blocks.0.ff.net.2/aten::linear/MatMul - 4645 __module.unet.down_blocks.2.attentions.1.transformer_blocks.0.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3132 __module.unet.down_blocks.2.attentions.1.transformer_blocks.1.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3133 __module.unet.down_blocks.2.attentions.1.transformer_blocks.1.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3134 __module.unet.down_blocks.2.attentions.1.transformer_blocks.1.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5116 
__module.unet.down_blocks.2.attentions.1.transformer_blocks.1.attn1.to_out.0/aten::linear/MatMul - 5194 __module.unet.down_blocks.2.attentions.1.transformer_blocks.1.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3493 __module.unet.down_blocks.2.attentions.1.transformer_blocks.1.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1825 __module.unet.down_blocks.2.attentions.1.transformer_blocks.1.attn2.to_out.0/aten::linear/MatMul - 1944 __module.unet.down_blocks.2.attentions.1.transformer_blocks.1.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2879 __module.unet.down_blocks.2.attentions.1.transformer_blocks.1.ff.net.0.proj/aten::linear/MatMul - 3136 __module.unet.down_blocks.2.attentions.1.transformer_blocks.1.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4338 __module.unet.down_blocks.2.attentions.1.transformer_blocks.1.ff.net.2/aten::linear/MatMul - 4649 __module.unet.down_blocks.2.attentions.1.transformer_blocks.1.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3137 __module.unet.down_blocks.2.attentions.1.transformer_blocks.2.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3138 __module.unet.down_blocks.2.attentions.1.transformer_blocks.2.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3139 __module.unet.down_blocks.2.attentions.1.transformer_blocks.2.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5117 __module.unet.down_blocks.2.attentions.1.transformer_blocks.2.attn1.to_out.0/aten::linear/MatMul - 5195 __module.unet.down_blocks.2.attentions.1.transformer_blocks.2.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3499 __module.unet.down_blocks.2.attentions.1.transformer_blocks.2.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1826 __module.unet.down_blocks.2.attentions.1.transformer_blocks.2.attn2.to_out.0/aten::linear/MatMul - 1945 __module.unet.down_blocks.2.attentions.1.transformer_blocks.2.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2882 __module.unet.down_blocks.2.attentions.1.transformer_blocks.2.ff.net.0.proj/aten::linear/MatMul - 3141 __module.unet.down_blocks.2.attentions.1.transformer_blocks.2.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4342 __module.unet.down_blocks.2.attentions.1.transformer_blocks.2.ff.net.2/aten::linear/MatMul - 4653 __module.unet.down_blocks.2.attentions.1.transformer_blocks.2.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3142 __module.unet.down_blocks.2.attentions.1.transformer_blocks.3.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3143 __module.unet.down_blocks.2.attentions.1.transformer_blocks.3.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3144 __module.unet.down_blocks.2.attentions.1.transformer_blocks.3.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5118 __module.unet.down_blocks.2.attentions.1.transformer_blocks.3.attn1.to_out.0/aten::linear/MatMul - 5196 
__module.unet.down_blocks.2.attentions.1.transformer_blocks.3.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3505 __module.unet.down_blocks.2.attentions.1.transformer_blocks.3.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1827 __module.unet.down_blocks.2.attentions.1.transformer_blocks.3.attn2.to_out.0/aten::linear/MatMul - 1946 __module.unet.down_blocks.2.attentions.1.transformer_blocks.3.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2885 __module.unet.down_blocks.2.attentions.1.transformer_blocks.3.ff.net.0.proj/aten::linear/MatMul - 3146 __module.unet.down_blocks.2.attentions.1.transformer_blocks.3.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4346 __module.unet.down_blocks.2.attentions.1.transformer_blocks.3.ff.net.2/aten::linear/MatMul - 4657 __module.unet.down_blocks.2.attentions.1.transformer_blocks.3.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3147 __module.unet.down_blocks.2.attentions.1.transformer_blocks.4.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3148 __module.unet.down_blocks.2.attentions.1.transformer_blocks.4.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3149 __module.unet.down_blocks.2.attentions.1.transformer_blocks.4.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5119 __module.unet.down_blocks.2.attentions.1.transformer_blocks.4.attn1.to_out.0/aten::linear/MatMul - 5197 __module.unet.down_blocks.2.attentions.1.transformer_blocks.4.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3511 __module.unet.down_blocks.2.attentions.1.transformer_blocks.4.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1828 __module.unet.down_blocks.2.attentions.1.transformer_blocks.4.attn2.to_out.0/aten::linear/MatMul - 1947 __module.unet.down_blocks.2.attentions.1.transformer_blocks.4.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2888 __module.unet.down_blocks.2.attentions.1.transformer_blocks.4.ff.net.0.proj/aten::linear/MatMul - 3151 __module.unet.down_blocks.2.attentions.1.transformer_blocks.4.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4350 __module.unet.down_blocks.2.attentions.1.transformer_blocks.4.ff.net.2/aten::linear/MatMul - 4661 __module.unet.down_blocks.2.attentions.1.transformer_blocks.4.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3152 __module.unet.down_blocks.2.attentions.1.transformer_blocks.5.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3153 __module.unet.down_blocks.2.attentions.1.transformer_blocks.5.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3154 __module.unet.down_blocks.2.attentions.1.transformer_blocks.5.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5120 __module.unet.down_blocks.2.attentions.1.transformer_blocks.5.attn1.to_out.0/aten::linear/MatMul - 5198 __module.unet.down_blocks.2.attentions.1.transformer_blocks.5.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding 
activation input quantizer for operation: 3517 __module.unet.down_blocks.2.attentions.1.transformer_blocks.5.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1829 __module.unet.down_blocks.2.attentions.1.transformer_blocks.5.attn2.to_out.0/aten::linear/MatMul - 1948 __module.unet.down_blocks.2.attentions.1.transformer_blocks.5.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2891 __module.unet.down_blocks.2.attentions.1.transformer_blocks.5.ff.net.0.proj/aten::linear/MatMul - 3156 __module.unet.down_blocks.2.attentions.1.transformer_blocks.5.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4354 __module.unet.down_blocks.2.attentions.1.transformer_blocks.5.ff.net.2/aten::linear/MatMul - 4665 __module.unet.down_blocks.2.attentions.1.transformer_blocks.5.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3157 __module.unet.down_blocks.2.attentions.1.transformer_blocks.6.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3158 __module.unet.down_blocks.2.attentions.1.transformer_blocks.6.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3159 __module.unet.down_blocks.2.attentions.1.transformer_blocks.6.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5121 __module.unet.down_blocks.2.attentions.1.transformer_blocks.6.attn1.to_out.0/aten::linear/MatMul - 5199 __module.unet.down_blocks.2.attentions.1.transformer_blocks.6.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3523 __module.unet.down_blocks.2.attentions.1.transformer_blocks.6.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1830 __module.unet.down_blocks.2.attentions.1.transformer_blocks.6.attn2.to_out.0/aten::linear/MatMul - 1949 __module.unet.down_blocks.2.attentions.1.transformer_blocks.6.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2894 __module.unet.down_blocks.2.attentions.1.transformer_blocks.6.ff.net.0.proj/aten::linear/MatMul - 3161 __module.unet.down_blocks.2.attentions.1.transformer_blocks.6.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4358 __module.unet.down_blocks.2.attentions.1.transformer_blocks.6.ff.net.2/aten::linear/MatMul - 4669 __module.unet.down_blocks.2.attentions.1.transformer_blocks.6.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3162 __module.unet.down_blocks.2.attentions.1.transformer_blocks.7.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3163 __module.unet.down_blocks.2.attentions.1.transformer_blocks.7.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3164 __module.unet.down_blocks.2.attentions.1.transformer_blocks.7.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5122 __module.unet.down_blocks.2.attentions.1.transformer_blocks.7.attn1.to_out.0/aten::linear/MatMul - 5200 __module.unet.down_blocks.2.attentions.1.transformer_blocks.7.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3529 
__module.unet.down_blocks.2.attentions.1.transformer_blocks.7.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1831 __module.unet.down_blocks.2.attentions.1.transformer_blocks.7.attn2.to_out.0/aten::linear/MatMul - 1950 __module.unet.down_blocks.2.attentions.1.transformer_blocks.7.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2897 __module.unet.down_blocks.2.attentions.1.transformer_blocks.7.ff.net.0.proj/aten::linear/MatMul - 3166 __module.unet.down_blocks.2.attentions.1.transformer_blocks.7.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4362 __module.unet.down_blocks.2.attentions.1.transformer_blocks.7.ff.net.2/aten::linear/MatMul - 4673 __module.unet.down_blocks.2.attentions.1.transformer_blocks.7.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3167 __module.unet.down_blocks.2.attentions.1.transformer_blocks.8.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3168 __module.unet.down_blocks.2.attentions.1.transformer_blocks.8.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3169 __module.unet.down_blocks.2.attentions.1.transformer_blocks.8.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5123 __module.unet.down_blocks.2.attentions.1.transformer_blocks.8.attn1.to_out.0/aten::linear/MatMul - 5201 __module.unet.down_blocks.2.attentions.1.transformer_blocks.8.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3535 __module.unet.down_blocks.2.attentions.1.transformer_blocks.8.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1832 __module.unet.down_blocks.2.attentions.1.transformer_blocks.8.attn2.to_out.0/aten::linear/MatMul - 1951 __module.unet.down_blocks.2.attentions.1.transformer_blocks.8.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2900 __module.unet.down_blocks.2.attentions.1.transformer_blocks.8.ff.net.0.proj/aten::linear/MatMul - 3171 __module.unet.down_blocks.2.attentions.1.transformer_blocks.8.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4366 __module.unet.down_blocks.2.attentions.1.transformer_blocks.8.ff.net.2/aten::linear/MatMul - 4677 __module.unet.down_blocks.2.attentions.1.transformer_blocks.8.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3172 __module.unet.down_blocks.2.attentions.1.transformer_blocks.9.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3173 __module.unet.down_blocks.2.attentions.1.transformer_blocks.9.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3174 __module.unet.down_blocks.2.attentions.1.transformer_blocks.9.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5124 __module.unet.down_blocks.2.attentions.1.transformer_blocks.9.attn1.to_out.0/aten::linear/MatMul - 5202 __module.unet.down_blocks.2.attentions.1.transformer_blocks.9.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3541 __module.unet.down_blocks.2.attentions.1.transformer_blocks.9.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding 
activation input quantizer for operation: 1833 __module.unet.down_blocks.2.attentions.1.transformer_blocks.9.attn2.to_out.0/aten::linear/MatMul - 1952 __module.unet.down_blocks.2.attentions.1.transformer_blocks.9.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2903 __module.unet.down_blocks.2.attentions.1.transformer_blocks.9.ff.net.0.proj/aten::linear/MatMul - 3176 __module.unet.down_blocks.2.attentions.1.transformer_blocks.9.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4370 __module.unet.down_blocks.2.attentions.1.transformer_blocks.9.ff.net.2/aten::linear/MatMul - 4681 __module.unet.down_blocks.2.attentions.1.transformer_blocks.9.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2436 __module.unet.down_blocks.2.attentions.1.proj_out/aten::linear/MatMul - 2665 __module.unet.down_blocks.2.attentions.1.proj_out/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 5274 __module.unet.mid_block.attentions.0.proj_in/aten::linear/MatMul - 5280 __module.unet.mid_block.attentions.0.proj_in/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 5315 __module.unet.mid_block.attentions.0.transformer_blocks.0.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5316 __module.unet.mid_block.attentions.0.transformer_blocks.0.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5317 __module.unet.mid_block.attentions.0.transformer_blocks.0.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5348 __module.unet.mid_block.attentions.0.transformer_blocks.0.attn1.to_out.0/aten::linear/MatMul - 5350 __module.unet.mid_block.attentions.0.transformer_blocks.0.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 5329 __module.unet.mid_block.attentions.0.transformer_blocks.0.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1834 __module.unet.mid_block.attentions.0.transformer_blocks.0.attn2.to_out.0/aten::linear/MatMul - 1953 __module.unet.mid_block.attentions.0.transformer_blocks.0.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2905 __module.unet.mid_block.attentions.0.transformer_blocks.0.ff.net.0.proj/aten::linear/MatMul - 3178 __module.unet.mid_block.attentions.0.transformer_blocks.0.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4378 __module.unet.mid_block.attentions.0.transformer_blocks.0.ff.net.2/aten::linear/MatMul - 4688 __module.unet.mid_block.attentions.0.transformer_blocks.0.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3179 __module.unet.mid_block.attentions.0.transformer_blocks.1.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3180 __module.unet.mid_block.attentions.0.transformer_blocks.1.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3181 __module.unet.mid_block.attentions.0.transformer_blocks.1.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5127 __module.unet.mid_block.attentions.0.transformer_blocks.1.attn1.to_out.0/aten::linear/MatMul - 5205 
__module.unet.mid_block.attentions.0.transformer_blocks.1.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3550 __module.unet.mid_block.attentions.0.transformer_blocks.1.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1835 __module.unet.mid_block.attentions.0.transformer_blocks.1.attn2.to_out.0/aten::linear/MatMul - 1954 __module.unet.mid_block.attentions.0.transformer_blocks.1.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2908 __module.unet.mid_block.attentions.0.transformer_blocks.1.ff.net.0.proj/aten::linear/MatMul - 3183 __module.unet.mid_block.attentions.0.transformer_blocks.1.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4382 __module.unet.mid_block.attentions.0.transformer_blocks.1.ff.net.2/aten::linear/MatMul - 4692 __module.unet.mid_block.attentions.0.transformer_blocks.1.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3184 __module.unet.mid_block.attentions.0.transformer_blocks.2.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3185 __module.unet.mid_block.attentions.0.transformer_blocks.2.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3186 __module.unet.mid_block.attentions.0.transformer_blocks.2.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5128 __module.unet.mid_block.attentions.0.transformer_blocks.2.attn1.to_out.0/aten::linear/MatMul - 5206 __module.unet.mid_block.attentions.0.transformer_blocks.2.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3556 __module.unet.mid_block.attentions.0.transformer_blocks.2.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1836 __module.unet.mid_block.attentions.0.transformer_blocks.2.attn2.to_out.0/aten::linear/MatMul - 1955 __module.unet.mid_block.attentions.0.transformer_blocks.2.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2911 __module.unet.mid_block.attentions.0.transformer_blocks.2.ff.net.0.proj/aten::linear/MatMul - 3188 __module.unet.mid_block.attentions.0.transformer_blocks.2.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4386 __module.unet.mid_block.attentions.0.transformer_blocks.2.ff.net.2/aten::linear/MatMul - 4696 __module.unet.mid_block.attentions.0.transformer_blocks.2.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3189 __module.unet.mid_block.attentions.0.transformer_blocks.3.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3190 __module.unet.mid_block.attentions.0.transformer_blocks.3.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3191 __module.unet.mid_block.attentions.0.transformer_blocks.3.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5129 __module.unet.mid_block.attentions.0.transformer_blocks.3.attn1.to_out.0/aten::linear/MatMul - 5207 __module.unet.mid_block.attentions.0.transformer_blocks.3.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3562 
__module.unet.mid_block.attentions.0.transformer_blocks.3.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1837 __module.unet.mid_block.attentions.0.transformer_blocks.3.attn2.to_out.0/aten::linear/MatMul - 1956 __module.unet.mid_block.attentions.0.transformer_blocks.3.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2914 __module.unet.mid_block.attentions.0.transformer_blocks.3.ff.net.0.proj/aten::linear/MatMul - 3193 __module.unet.mid_block.attentions.0.transformer_blocks.3.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4390 __module.unet.mid_block.attentions.0.transformer_blocks.3.ff.net.2/aten::linear/MatMul - 4700 __module.unet.mid_block.attentions.0.transformer_blocks.3.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3194 __module.unet.mid_block.attentions.0.transformer_blocks.4.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3195 __module.unet.mid_block.attentions.0.transformer_blocks.4.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3196 __module.unet.mid_block.attentions.0.transformer_blocks.4.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5130 __module.unet.mid_block.attentions.0.transformer_blocks.4.attn1.to_out.0/aten::linear/MatMul - 5208 __module.unet.mid_block.attentions.0.transformer_blocks.4.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3568 __module.unet.mid_block.attentions.0.transformer_blocks.4.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1838 __module.unet.mid_block.attentions.0.transformer_blocks.4.attn2.to_out.0/aten::linear/MatMul - 1957 __module.unet.mid_block.attentions.0.transformer_blocks.4.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2917 __module.unet.mid_block.attentions.0.transformer_blocks.4.ff.net.0.proj/aten::linear/MatMul - 3198 __module.unet.mid_block.attentions.0.transformer_blocks.4.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4394 __module.unet.mid_block.attentions.0.transformer_blocks.4.ff.net.2/aten::linear/MatMul - 4704 __module.unet.mid_block.attentions.0.transformer_blocks.4.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3199 __module.unet.mid_block.attentions.0.transformer_blocks.5.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3200 __module.unet.mid_block.attentions.0.transformer_blocks.5.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3201 __module.unet.mid_block.attentions.0.transformer_blocks.5.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5131 __module.unet.mid_block.attentions.0.transformer_blocks.5.attn1.to_out.0/aten::linear/MatMul - 5209 __module.unet.mid_block.attentions.0.transformer_blocks.5.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3574 __module.unet.mid_block.attentions.0.transformer_blocks.5.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1839 
__module.unet.mid_block.attentions.0.transformer_blocks.5.attn2.to_out.0/aten::linear/MatMul - 1958 __module.unet.mid_block.attentions.0.transformer_blocks.5.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2920 __module.unet.mid_block.attentions.0.transformer_blocks.5.ff.net.0.proj/aten::linear/MatMul - 3203 __module.unet.mid_block.attentions.0.transformer_blocks.5.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4398 __module.unet.mid_block.attentions.0.transformer_blocks.5.ff.net.2/aten::linear/MatMul - 4708 __module.unet.mid_block.attentions.0.transformer_blocks.5.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3204 __module.unet.mid_block.attentions.0.transformer_blocks.6.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3205 __module.unet.mid_block.attentions.0.transformer_blocks.6.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3206 __module.unet.mid_block.attentions.0.transformer_blocks.6.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5132 __module.unet.mid_block.attentions.0.transformer_blocks.6.attn1.to_out.0/aten::linear/MatMul - 5210 __module.unet.mid_block.attentions.0.transformer_blocks.6.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3580 __module.unet.mid_block.attentions.0.transformer_blocks.6.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1840 __module.unet.mid_block.attentions.0.transformer_blocks.6.attn2.to_out.0/aten::linear/MatMul - 1959 __module.unet.mid_block.attentions.0.transformer_blocks.6.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2923 __module.unet.mid_block.attentions.0.transformer_blocks.6.ff.net.0.proj/aten::linear/MatMul - 3208 __module.unet.mid_block.attentions.0.transformer_blocks.6.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4402 __module.unet.mid_block.attentions.0.transformer_blocks.6.ff.net.2/aten::linear/MatMul - 4712 __module.unet.mid_block.attentions.0.transformer_blocks.6.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3209 __module.unet.mid_block.attentions.0.transformer_blocks.7.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3210 __module.unet.mid_block.attentions.0.transformer_blocks.7.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3211 __module.unet.mid_block.attentions.0.transformer_blocks.7.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5133 __module.unet.mid_block.attentions.0.transformer_blocks.7.attn1.to_out.0/aten::linear/MatMul - 5211 __module.unet.mid_block.attentions.0.transformer_blocks.7.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3586 __module.unet.mid_block.attentions.0.transformer_blocks.7.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1841 __module.unet.mid_block.attentions.0.transformer_blocks.7.attn2.to_out.0/aten::linear/MatMul - 1960 __module.unet.mid_block.attentions.0.transformer_blocks.7.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding 
activation input quantizer for operation: 2926 __module.unet.mid_block.attentions.0.transformer_blocks.7.ff.net.0.proj/aten::linear/MatMul - 3213 __module.unet.mid_block.attentions.0.transformer_blocks.7.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4406 __module.unet.mid_block.attentions.0.transformer_blocks.7.ff.net.2/aten::linear/MatMul - 4716 __module.unet.mid_block.attentions.0.transformer_blocks.7.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3214 __module.unet.mid_block.attentions.0.transformer_blocks.8.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3215 __module.unet.mid_block.attentions.0.transformer_blocks.8.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3216 __module.unet.mid_block.attentions.0.transformer_blocks.8.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5134 __module.unet.mid_block.attentions.0.transformer_blocks.8.attn1.to_out.0/aten::linear/MatMul - 5212 __module.unet.mid_block.attentions.0.transformer_blocks.8.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3592 __module.unet.mid_block.attentions.0.transformer_blocks.8.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1842 __module.unet.mid_block.attentions.0.transformer_blocks.8.attn2.to_out.0/aten::linear/MatMul - 1961 __module.unet.mid_block.attentions.0.transformer_blocks.8.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2929 __module.unet.mid_block.attentions.0.transformer_blocks.8.ff.net.0.proj/aten::linear/MatMul - 3218 __module.unet.mid_block.attentions.0.transformer_blocks.8.ff.net.0.proj/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 4410 __module.unet.mid_block.attentions.0.transformer_blocks.8.ff.net.2/aten::linear/MatMul - 4720 __module.unet.mid_block.attentions.0.transformer_blocks.8.ff.net.2/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3219 __module.unet.mid_block.attentions.0.transformer_blocks.9.attn1.to_k/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3220 __module.unet.mid_block.attentions.0.transformer_blocks.9.attn1.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 3221 __module.unet.mid_block.attentions.0.transformer_blocks.9.attn1.to_v/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5135 __module.unet.mid_block.attentions.0.transformer_blocks.9.attn1.to_out.0/aten::linear/MatMul - 5213 __module.unet.mid_block.attentions.0.transformer_blocks.9.attn1.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 3598 __module.unet.mid_block.attentions.0.transformer_blocks.9.attn2.to_q/aten::linear/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 1843 __module.unet.mid_block.attentions.0.transformer_blocks.9.attn2.to_out.0/aten::linear/MatMul - 1962 __module.unet.mid_block.attentions.0.transformer_blocks.9.attn2.to_out.0/aten::linear/Add - - INFO:nncf:Not adding activation input quantizer for operation: 2932 __module.unet.mid_block.attentions.0.transformer_blocks.9.ff.net.0.proj/aten::linear/MatMul - 3223 
[The NNCF log continues in the same pattern for the remaining UNet blocks: for every attention projection (``to_q``/``to_k``/``to_v``/``to_out``), feed-forward (``ff.net``), ``proj_in``, and ``proj_out`` Linear operation in ``mid_block`` and ``up_blocks``, NNCF reports "Not adding activation input quantizer for operation" for the corresponding ``aten::linear`` MatMul/Add nodes. The full per-operation listing is not reproduced here.]

.. parsed-literal::

    Output()


.. parsed-literal::

    Output()
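Log lines like the ones summarized above are emitted by ``nncf.quantize`` when it leaves the activation inputs of accuracy-sensitive operations unquantized. The actual quantization cell appears earlier in this notebook; the snippet below is only a rough sketch of such a call, assuming the FP16 UNet IR path ``ov_unet_path`` from earlier cells and a hypothetical list of calibration inputs ``unet_calibration_data``:

.. code:: python

    import nncf
    import openvino as ov

    core = ov.Core()
    unet_model = core.read_model(ov_unet_path)  # assumption: FP16 UNet IR saved earlier in the notebook

    # `unet_calibration_data` is a hypothetical list of input dictionaries collected by
    # running the FP16 pipeline on a few prompts.
    quantized_unet = nncf.quantize(
        unet_model,
        nncf.Dataset(unet_calibration_data),
        # The transformer-specific model type makes NNCF skip activation quantizers for
        # accuracy-sensitive ops, such as the attention and feed-forward Linear layers listed above.
        model_type=nncf.ModelType.TRANSFORMER,
    )
    ov.save_model(quantized_unet, ov_int8_unet_path)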
Run Weights Compression
^^^^^^^^^^^^^^^^^^^^^^^

Quantizing the ``Text Encoders`` and ``VAE Decoder`` does not significantly improve
inference performance, but it can lead to a substantial degradation of accuracy.
Weight compression is therefore applied to these models instead, only to reduce
their memory footprint.

.. code:: ipython3

    %%skip not $to_quantize.value

    ov_int8_text_encoder_path = MODELS_DIR / 'text_encoder_optimized.xml'
    ov_int8_text_encoder_2_path = MODELS_DIR / 'text_encoder_2_optimized.xml'
    ov_int8_vae_decoder_path = MODELS_DIR / 'vae_decoder_optimized.xml'

    if not ov_int8_text_encoder_path.exists():
        text_encoder = core.read_model(ov_text_encoder_path)
        compressed_text_encoder = nncf.compress_weights(text_encoder)
        ov.save_model(compressed_text_encoder, ov_int8_text_encoder_path)

    if not ov_int8_text_encoder_2_path.exists():
        text_encoder_2 = core.read_model(ov_text_encoder_2_path)
        compressed_text_encoder_2 = nncf.compress_weights(text_encoder_2)
        ov.save_model(compressed_text_encoder_2, ov_int8_text_encoder_2_path)

    if not ov_int8_vae_decoder_path.exists():
        vae_decoder = core.read_model(ov_vae_decoder_path)
        compressed_vae_decoder = nncf.compress_weights(vae_decoder)
        ov.save_model(compressed_vae_decoder, ov_int8_vae_decoder_path)


.. parsed-literal::

    INFO:nncf:Statistics of the bitwidth distribution:
    ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
    │   Num bits (N) │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
    ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
    │              8 │ 100% (74 / 74)              │ 100% (74 / 74)                         │
    ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙


.. parsed-literal::

    Output()


.. parsed-literal::

    INFO:nncf:Statistics of the bitwidth distribution:
    ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
    │   Num bits (N) │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
    ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
    │              8 │ 100% (195 / 195)            │ 100% (195 / 195)                       │
    ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙


.. parsed-literal::

    Output()


.. parsed-literal::

    INFO:nncf:Statistics of the bitwidth distribution:
    ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
    │   Num bits (N) │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
    ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
    │              8 │ 100% (40 / 40)              │ 100% (40 / 40)                         │
    ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙


.. parsed-literal::

    Output()
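The statistics above only report the bitwidth distribution. A minimal sketch (not part of the original notebook) of how the footprint reduction could be verified on disk, reusing the FP16 and INT8 IR paths defined in the cell above:

.. code:: python

    from pathlib import Path


    def ir_size_mb(xml_path: Path) -> float:
        # An OpenVINO IR consists of an .xml topology file plus a .bin file with the weights,
        # so both are counted.
        bin_path = xml_path.with_suffix(".bin")
        return (xml_path.stat().st_size + bin_path.stat().st_size) / 1024**2


    for fp16_path, int8_path in [
        (ov_text_encoder_path, ov_int8_text_encoder_path),
        (ov_text_encoder_2_path, ov_int8_text_encoder_2_path),
        (ov_vae_decoder_path, ov_int8_vae_decoder_path),
    ]:
        print(f"{fp16_path.stem}: {ir_size_mb(fp16_path):.1f} MB -> {ir_size_mb(int8_path):.1f} MB")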
parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (40 / 40) │ 100% (40 / 40) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -Let’s compare the images generated by the original and optimized -pipelines. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - int8_pipe = create_ov_pipe( - ov_int8_text_encoder_path, - ov_int8_text_encoder_2_path, - ov_image_proj_encoder_path, - ov_int8_controlnet_path, - ov_int8_unet_path, - ov_int8_vae_decoder_path, - MODELS_DIR / "tokenizer", - MODELS_DIR / "tokenizer_2", - MODELS_DIR / "scheduler" - ) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - int8_image = int8_pipe( - prompt, - image_embeds=face_emb, - image=face_kps, - num_inference_steps=4, - negative_prompt=negative_prompt, - guidance_scale=0.5, - generator=torch.Generator(device="cpu").manual_seed(1749781188) - ).images[0] - - - -.. parsed-literal:: - - 0%| | 0/4 [00:00 PIL.Image: - return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) - - - def convert_from_image_to_cv2(img: PIL.Image) -> np.ndarray: - return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) - - - def resize_img( - input_image, - max_side=1024, - min_side=800, - size=None, - pad_to_max_side=False, - mode=PIL.Image.BILINEAR, - base_pixel_number=64, - ): - w, h = input_image.size - if size is not None: - w_resize_new, h_resize_new = size - else: - ratio = min_side / min(h, w) - w, h = round(ratio * w), round(ratio * h) - ratio = max_side / max(h, w) - input_image = input_image.resize([round(ratio * w), round(ratio * h)], mode) - w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number - h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number - input_image = input_image.resize([w_resize_new, h_resize_new], mode) - - if pad_to_max_side: - res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255 - offset_x = (max_side - w_resize_new) // 2 - offset_y = (max_side - h_resize_new) // 2 - res[offset_y : offset_y + h_resize_new, offset_x : offset_x + w_resize_new] = np.array(input_image) - input_image = Image.fromarray(res) - return input_image - - - def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]: - p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME]) - return p.replace("{prompt}", positive), n + " " + negative - - - def generate_image( - face_image, - pose_image, - prompt, - negative_prompt, - style_name, - num_steps, - identitynet_strength_ratio, - guidance_scale, - seed, - progress=gr.Progress(track_tqdm=True), - ): - if prompt is None: - prompt = "a person" - - # apply the style template - prompt, negative_prompt = apply_style(style_name, prompt, negative_prompt) - - # face_image = load_image(face_image_path) - face_image = resize_img(face_image) - face_image_cv2 = convert_from_image_to_cv2(face_image) - height, width, _ = face_image_cv2.shape - - # Extract face features - face_info = app.get(face_image_cv2) - - if len(face_info) == 0: - raise gr.Error("Cannot find any face in the image! 
Please upload another person image") - - face_info = sorted( - face_info, - key=lambda x: (x["bbox"][2] - x["bbox"][0]) * x["bbox"][3] - x["bbox"][1], - )[ - -1 - ] # only use the maximum face - face_emb = face_info["embedding"] - face_kps = draw_kps(convert_from_cv2_to_image(face_image_cv2), face_info["kps"]) - - if pose_image is not None: - # pose_image = load_image(pose_image_path) - pose_image = resize_img(pose_image) - pose_image_cv2 = convert_from_image_to_cv2(pose_image) - - face_info = app.get(pose_image_cv2) - - if len(face_info) == 0: - raise gr.Error("Cannot find any face in the reference image! Please upload another person image") - - face_info = face_info[-1] - face_kps = draw_kps(pose_image, face_info["kps"]) - - width, height = face_kps.size - - generator = torch.Generator(device="cpu").manual_seed(seed) - - print("Start inference...") - print(f"[Debug] Prompt: {prompt}, \n[Debug] Neg Prompt: {negative_prompt}") - images = pipeline( - prompt=prompt, - negative_prompt=negative_prompt, - image_embeds=face_emb, - image=face_kps, - controlnet_conditioning_scale=float(identitynet_strength_ratio), - num_inference_steps=num_steps, - guidance_scale=guidance_scale, - height=height, - width=width, - generator=generator, - ).images - - return images[0] - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/instant-id/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=generate_image) - - try: - demo.launch(debug=False) - except Exception: - demo.launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - -.. 
code:: ipython3 - - # please uncomment and run this cell for stopping gradio interface - # demo.close() diff --git a/docs/notebooks/instant-id-with-output_files/instant-id-with-output_16_0.jpg b/docs/notebooks/instant-id-with-output_files/instant-id-with-output_16_0.jpg deleted file mode 100644 index 9204c96e2f27b5..00000000000000 --- a/docs/notebooks/instant-id-with-output_files/instant-id-with-output_16_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b8cb87ed65dd68ed3a55169231dc4b535b642c700b6f407a8fd36551ccdb1b44 -size 57403 diff --git a/docs/notebooks/instant-id-with-output_files/instant-id-with-output_16_0.png b/docs/notebooks/instant-id-with-output_files/instant-id-with-output_16_0.png deleted file mode 100644 index 75c81edf77497e..00000000000000 --- a/docs/notebooks/instant-id-with-output_files/instant-id-with-output_16_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b3aaae251bfd76143e413cba9580629e315af4a963723a4d5f2baed7d616befe -size 418498 diff --git a/docs/notebooks/instant-id-with-output_files/instant-id-with-output_17_0.jpg b/docs/notebooks/instant-id-with-output_files/instant-id-with-output_17_0.jpg deleted file mode 100644 index 3922ec0b103b57..00000000000000 --- a/docs/notebooks/instant-id-with-output_files/instant-id-with-output_17_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f84d31ac2853b19f8dc12f280bffcd132b9d5905f51aefcea3b5e9c79a5e2508 -size 13727 diff --git a/docs/notebooks/instant-id-with-output_files/instant-id-with-output_17_0.png b/docs/notebooks/instant-id-with-output_files/instant-id-with-output_17_0.png deleted file mode 100644 index 3bdbdcbd0acd60..00000000000000 --- a/docs/notebooks/instant-id-with-output_files/instant-id-with-output_17_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9cc497376f7ac254d2906f1a3be71adc8948e0c566fd50cab1b8df2709db4318 -size 10852 diff --git a/docs/notebooks/instant-id-with-output_files/instant-id-with-output_40_0.jpg b/docs/notebooks/instant-id-with-output_files/instant-id-with-output_40_0.jpg deleted file mode 100644 index 1b7b774c96bf12..00000000000000 --- a/docs/notebooks/instant-id-with-output_files/instant-id-with-output_40_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:646a654572e1fdf08b0b47c55913f169ba98aadc61a6cfdefaaaaf74e6abe523 -size 46252 diff --git a/docs/notebooks/instant-id-with-output_files/instant-id-with-output_40_0.png b/docs/notebooks/instant-id-with-output_files/instant-id-with-output_40_0.png deleted file mode 100644 index ac988a697d1f0c..00000000000000 --- a/docs/notebooks/instant-id-with-output_files/instant-id-with-output_40_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:17a0d50fbd64de6661e650d9e3b847b2756a3471ba87982c2c594dbd9a34cd60 -size 819683 diff --git a/docs/notebooks/instant-id-with-output_files/instant-id-with-output_66_0.png b/docs/notebooks/instant-id-with-output_files/instant-id-with-output_66_0.png deleted file mode 100644 index 31cfafc161f573..00000000000000 --- a/docs/notebooks/instant-id-with-output_files/instant-id-with-output_66_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5072e5c2fee17632ff07397a6f49eb7dbbbc4034f393d97660e720b3968af590 -size 2022065 diff --git a/docs/notebooks/instruct-pix2pix-image-editing-with-output.rst 
b/docs/notebooks/instruct-pix2pix-image-editing-with-output.rst deleted file mode 100644 index fcbd81752ec21a..00000000000000 --- a/docs/notebooks/instruct-pix2pix-image-editing-with-output.rst +++ /dev/null @@ -1,1377 +0,0 @@ -Image Editing with InstructPix2Pix and OpenVINO -=============================================== - -The InstructPix2Pix is a conditional diffusion model that edits images -based on written instructions provided by the user. Generative image -editing models traditionally target a single editing task like style -transfer or translation between image domains. Text guidance gives us an -opportunity to solve multiple tasks with a single model. The -InstructPix2Pix method works different than existing text-based image -editing in that it enables editing from instructions that tell the model -what action to perform instead of using text labels, captions or -descriptions of input/output images. A key benefit of following editing -instructions is that the user can just tell the model exactly what to do -in natural written text. There is no need for the user to provide extra -information, such as example images or descriptions of visual content -that remain constant between the input and output images. More details -about this approach can be found in this -`paper `__ and -`repository `__. - -This notebook demonstrates how to convert and run the InstructPix2Pix -model using OpenVINO. - -Notebook contains the following steps: - -1. Convert PyTorch models to OpenVINO IR format, using Model Conversion - API. -2. Run InstructPix2Pix pipeline with OpenVINO. -3. Optimize InstructPix2Pix pipeline with - `NNCF `__ quantization. -4. Compare results of original and optimized pipelines. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Create Pytorch Models pipeline <#create-pytorch-models-pipeline>`__ -- `Convert Models to OpenVINO IR <#convert-models-to-openvino-ir>`__ - - - `Text Encoder <#text-encoder>`__ - - `VAE <#vae>`__ - - `Unet <#unet>`__ - -- `Prepare Inference Pipeline <#prepare-inference-pipeline>`__ -- `Quantization <#quantization>`__ - - - `Prepare calibration dataset <#prepare-calibration-dataset>`__ - - `Run quantization <#run-quantization>`__ - - `Compare inference time of the FP16 and INT8 - models <#compare-inference-time-of-the-fp16-and-int8-models>`__ - -- `Interactive demo with Gradio <#interactive-demo-with-gradio>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install necessary packages - -.. code:: ipython3 - - %pip install -q "transformers>=4.25.1" torch accelerate "gradio>4.19" "datasets>=2.14.6" "matplotlib>=3.4" diffusers pillow opencv-python --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "openvino>=2024.3.0" - -Create Pytorch Models pipeline ------------------------------- - - - -``StableDiffusionInstructPix2PixPipeline`` is an end-to-end inference -pipeline that you can use to edit images from text instructions with -just a few lines of code provided as part - `diffusers `__ library. - -First, we load the pre-trained weights of all components of the model. - - **NOTE**: Initially, model loading can take some time due to - downloading the weights. Also, the download speed depends on your - internet connection. - -.. 
code:: ipython3 - - import torch - from diffusers import ( - StableDiffusionInstructPix2PixPipeline, - EulerAncestralDiscreteScheduler, - ) - - model_id = "timbrooks/instruct-pix2pix" - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float32, safety_checker=None) - scheduler_config = pipe.scheduler.config - text_encoder = pipe.text_encoder - text_encoder.eval() - unet = pipe.unet - unet.eval() - vae = pipe.vae - vae.eval() - - del pipe - -Convert Models to OpenVINO IR ------------------------------ - - - -OpenVINO supports PyTorch models using `Model Conversion -API `__ -to convert the model to IR format. ``ov.convert_model`` function accepts -PyTorch model object and example input and then converts it to -``ov.Model`` class instance that ready to use for loading on device or -can be saved on disk using ``ov.save_model``. - -The InstructPix2Pix model is based on Stable Diffusion, a large-scale -text-to-image latent diffusion model. You can find more details about -how to run Stable Diffusion for text-to-image generation with OpenVINO -in a separate -`tutorial `__. - -The model consists of three important parts: - -- Text Encoder - to create conditions from a text prompt. -- Unet - for step-by-step denoising latent image representation. -- Autoencoder (VAE) - to encode the initial image to latent space for - starting the denoising process and decoding latent space to image, - when denoising is complete. - -Let us convert each part. - -Text Encoder -~~~~~~~~~~~~ - - - -The text-encoder is responsible for transforming the input prompt, for -example, “a photo of an astronaut riding a horse” into an embedding -space that can be understood by the UNet. It is usually a simple -transformer-based encoder that maps a sequence of input tokens to a -sequence of latent text embeddings. - -Input of the text encoder is tensor ``input_ids``, which contains -indexes of tokens from text processed by tokenizer and padded to maximum -length accepted by the model. Model outputs are two tensors: -``last_hidden_state`` - hidden state from the last MultiHeadAttention -layer in the model and ``pooler_out`` - pooled output for whole model -hidden states. - -.. code:: ipython3 - - from pathlib import Path - import openvino as ov - import gc - - core = ov.Core() - - TEXT_ENCODER_OV_PATH = Path("text_encoder.xml") - - - def cleanup_torchscript_cache(): - """ - Helper for removing cached model representation - """ - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - - - def convert_encoder(text_encoder: torch.nn.Module, ir_path: Path): - """ - Convert Text Encoder mode. 
- Function accepts text encoder model, and prepares example inputs for conversion, - Parameters: - text_encoder (torch.nn.Module): text_encoder model from Stable Diffusion pipeline - ir_path (Path): File for storing model - Returns: - None - """ - input_ids = torch.ones((1, 77), dtype=torch.long) - # switch model to inference mode - text_encoder.eval() - - # disable gradients calculation for reducing memory consumption - with torch.no_grad(): - # Export model to IR format - ov_model = ov.convert_model( - text_encoder, - example_input=input_ids, - input=[ - (1, 77), - ], - ) - ov.save_model(ov_model, ir_path) - del ov_model - cleanup_torchscript_cache() - print(f"Text Encoder successfully converted to IR and saved to {ir_path}") - - - if not TEXT_ENCODER_OV_PATH.exists(): - convert_encoder(text_encoder, TEXT_ENCODER_OV_PATH) - else: - print(f"Text encoder will be loaded from {TEXT_ENCODER_OV_PATH}") - - del text_encoder - gc.collect(); - -VAE -~~~ - - - -The VAE model consists of two parts: an encoder and a decoder. - -- The encoder is used to convert the image into a low dimensional - latent representation, which will serve as the input to the UNet - model. -- The decoder, conversely, transforms the latent representation back - into an image. - -In comparison with a text-to-image inference pipeline, where VAE is used -only for decoding, the pipeline also involves the original image -encoding. As the two parts are used separately in the pipeline on -different steps, and do not depend on each other, we should convert them -into two independent models. - -.. code:: ipython3 - - VAE_ENCODER_OV_PATH = Path("vae_encoder.xml") - - - def convert_vae_encoder(vae: torch.nn.Module, ir_path: Path): - """ - Convert VAE model for encoding to IR format. - Function accepts vae model, creates wrapper class for export only necessary for inference part, - prepares example inputs for conversion, - Parameters: - vae (torch.nn.Module): VAE model from StableDiffusio pipeline - ir_path (Path): File for storing model - Returns: - None - """ - - class VAEEncoderWrapper(torch.nn.Module): - def __init__(self, vae): - super().__init__() - self.vae = vae - - def forward(self, image): - return self.vae.encode(x=image)["latent_dist"].sample() - - vae_encoder = VAEEncoderWrapper(vae) - vae_encoder.eval() - image = torch.zeros((1, 3, 512, 512)) - with torch.no_grad(): - ov_model = ov.convert_model(vae_encoder, example_input=image, input=[((1, 3, 512, 512),)]) - ov.save_model(ov_model, ir_path) - del ov_model - cleanup_torchscript_cache() - print(f"VAE encoder successfully converted to IR and saved to {ir_path}") - - - if not VAE_ENCODER_OV_PATH.exists(): - convert_vae_encoder(vae, VAE_ENCODER_OV_PATH) - else: - print(f"VAE encoder will be loaded from {VAE_ENCODER_OV_PATH}") - - VAE_DECODER_OV_PATH = Path("vae_decoder.xml") - - - def convert_vae_decoder(vae: torch.nn.Module, ir_path: Path): - """ - Convert VAE model for decoding to IR format. 
- Function accepts vae model, creates wrapper class for export only necessary for inference part, - prepares example inputs for conversion, - Parameters: - vae (torch.nn.Module): VAE model frm StableDiffusion pipeline - ir_path (Path): File for storing model - Returns: - None - """ - - class VAEDecoderWrapper(torch.nn.Module): - def __init__(self, vae): - super().__init__() - self.vae = vae - - def forward(self, latents): - return self.vae.decode(latents) - - vae_decoder = VAEDecoderWrapper(vae) - latents = torch.zeros((1, 4, 64, 64)) - - vae_decoder.eval() - with torch.no_grad(): - ov_model = ov.convert_model(vae_decoder, example_input=latents, input=[((1, 4, 64, 64),)]) - ov.save_model(ov_model, ir_path) - del ov_model - cleanup_torchscript_cache() - print(f"VAE decoder successfully converted to IR and saved to {ir_path}") - - - if not VAE_DECODER_OV_PATH.exists(): - convert_vae_decoder(vae, VAE_DECODER_OV_PATH) - else: - print(f"VAE decoder will be loaded from {VAE_DECODER_OV_PATH}") - - del vae - gc.collect(); - -Unet -~~~~ - - - -The Unet model has three inputs: - -- ``scaled_latent_model_input`` - the latent image sample from previous - step. Generation process has not been started yet, so you will use - random noise. -- ``timestep`` - a current scheduler step. -- ``text_embeddings`` - a hidden state of the text encoder. - -Model predicts the ``sample`` state for the next step. - -.. code:: ipython3 - - import numpy as np - - UNET_OV_PATH = Path("unet.xml") - - dtype_mapping = {torch.float32: ov.Type.f32, torch.float64: ov.Type.f64} - - - def convert_unet(unet: torch.nn.Module, ir_path: Path): - """ - Convert U-net model to IR format. - Function accepts unet model, prepares example inputs for conversion, - Parameters: - unet (StableDiffusionPipeline): unet from Stable Diffusion pipeline - ir_path (Path): File for storing model - Returns: - None - """ - # prepare inputs - encoder_hidden_state = torch.ones((3, 77, 768)) - latents_shape = (3, 8, 512 // 8, 512 // 8) - latents = torch.randn(latents_shape) - t = torch.from_numpy(np.array(1, dtype=float)) - dummy_inputs = (latents, t, encoder_hidden_state) - input_info = [] - for input_tensor in dummy_inputs: - shape = ov.PartialShape(tuple(input_tensor.shape)) - element_type = dtype_mapping[input_tensor.dtype] - input_info.append((shape, element_type)) - - unet.eval() - with torch.no_grad(): - ov_model = ov.convert_model(unet, example_input=dummy_inputs, input=input_info) - ov.save_model(ov_model, ir_path) - del ov_model - cleanup_torchscript_cache() - print(f"Unet successfully converted to IR and saved to {ir_path}") - - - if not UNET_OV_PATH.exists(): - convert_unet(unet, UNET_OV_PATH) - gc.collect() - else: - print(f"Unet will be loaded from {UNET_OV_PATH}") - del unet - gc.collect(); - -Prepare Inference Pipeline --------------------------- - - - -Putting it all together, let us now take a closer look at how the model -inference works by illustrating the logical flow. - -.. figure:: https://user-images.githubusercontent.com/29454499/214895365-3063ac11-0486-4d9b-9e25-8f469aba5e5d.png - :alt: diagram - - diagram - -The InstructPix2Pix model takes both an image and a text prompt as an -input. The image is transformed to latent image representations of size -:math:`64 \times 64`, using the encoder part of variational autoencoder, -whereas the text prompt is transformed to text embeddings of size -:math:`77 \times 768` via CLIP’s text encoder. 
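-
-If you would like to double-check the tensor shapes mentioned above, the
-converted IR files can be read back using the ``core`` instance and the model
-paths defined earlier. The snippet below is only a minimal, optional sketch
-for inspection and is not required by the pipeline:
-
-.. code:: ipython3
-
-    # Optional sanity check: print the input shapes of the converted IR models.
-    # Uses the `core` instance and *_OV_PATH variables defined in the cells above.
-    for name, path in [
-        ("text_encoder", TEXT_ENCODER_OV_PATH),
-        ("vae_encoder", VAE_ENCODER_OV_PATH),
-        ("unet", UNET_OV_PATH),
-        ("vae_decoder", VAE_DECODER_OV_PATH),
-    ]:
-        ir_model = core.read_model(path)
-        print(name, [str(inp.get_partial_shape()) for inp in ir_model.inputs])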
- -Next, the UNet model iteratively *denoises* the random latent image -representations while being conditioned on the text embeddings. The -output of the UNet, being the noise residual, is used to compute a -denoised latent image representation via a scheduler algorithm. - -The *denoising* process is repeated a given number of times (by default -100) to retrieve step-by-step better latent image representations. Once -it has been completed, the latent image representation is decoded by the -decoder part of the variational auto encoder. - -.. code:: ipython3 - - from diffusers import DiffusionPipeline - from transformers import CLIPTokenizer - from typing import Union, List, Optional, Tuple - import PIL - import cv2 - - - def scale_fit_to_window(dst_width: int, dst_height: int, image_width: int, image_height: int): - """ - Preprocessing helper function for calculating image size for resize with peserving original aspect ratio - and fitting image to specific window size - - Parameters: - dst_width (int): destination window width - dst_height (int): destination window height - image_width (int): source image width - image_height (int): source image height - Returns: - result_width (int): calculated width for resize - result_height (int): calculated height for resize - """ - im_scale = min(dst_height / image_height, dst_width / image_width) - return int(im_scale * image_width), int(im_scale * image_height) - - - def preprocess(image: PIL.Image.Image): - """ - Image preprocessing function. Takes image in PIL.Image format, resizes it to keep aspect ration and fits to model input window 512x512, - then converts it to np.ndarray and adds padding with zeros on right or bottom side of image (depends from aspect ratio), after that - converts data to float32 data type and change range of values from [0, 255] to [-1, 1], finally, converts data layout from planar NHWC to NCHW. - The function returns preprocessed input tensor and padding size, which can be used in postprocessing. 
- - Parameters: - image (PIL.Image.Image): input image - Returns: - image (np.ndarray): preprocessed image tensor - pad (Tuple[int]): pading size for each dimension for restoring image size in postprocessing - """ - src_width, src_height = image.size - dst_width, dst_height = scale_fit_to_window(512, 512, src_width, src_height) - image = np.array(image.resize((dst_width, dst_height), resample=PIL.Image.Resampling.LANCZOS))[None, :] - pad_width = 512 - dst_width - pad_height = 512 - dst_height - pad = ((0, 0), (0, pad_height), (0, pad_width), (0, 0)) - image = np.pad(image, pad, mode="constant") - image = image.astype(np.float32) / 255.0 - image = 2.0 * image - 1.0 - image = image.transpose(0, 3, 1, 2) - return image, pad - - - def randn_tensor( - shape: Union[Tuple, List], - dtype: Optional[np.dtype] = np.float32, - ): - """ - Helper function for generation random values tensor with given shape and data type - - Parameters: - shape (Union[Tuple, List]): shape for filling random values - dtype (np.dtype, *optiona*, np.float32): data type for result - Returns: - latents (np.ndarray): tensor with random values with given data type and shape (usually represents noise in latent space) - """ - latents = np.random.randn(*shape).astype(dtype) - - return latents - - - class OVInstructPix2PixPipeline(DiffusionPipeline): - """ - OpenVINO inference pipeline for InstructPix2Pix - """ - - def __init__( - self, - tokenizer: CLIPTokenizer, - scheduler: EulerAncestralDiscreteScheduler, - core: ov.Core, - text_encoder: ov.Model, - vae_encoder: ov.Model, - unet: ov.Model, - vae_decoder: ov.Model, - device: str = "AUTO", - ): - super().__init__() - self.tokenizer = tokenizer - self.vae_scale_factor = 8 - self.scheduler = scheduler - self.load_models(core, device, text_encoder, vae_encoder, unet, vae_decoder) - - def load_models( - self, - core: ov.Core, - device: str, - text_encoder: ov.Model, - vae_encoder: ov.Model, - unet: ov.Model, - vae_decoder: ov.Model, - ): - """ - Function for loading models on device using OpenVINO - - Parameters: - core (Core): OpenVINO runtime Core class instance - device (str): inference device - text_encoder (Model): OpenVINO Model object represents text encoder - vae_encoder (Model): OpenVINO Model object represents vae encoder - unet (Model): OpenVINO Model object represents unet - vae_decoder (Model): OpenVINO Model object represents vae decoder - Returns - None - """ - self.text_encoder = core.compile_model(text_encoder, device) - self.text_encoder_out = self.text_encoder.output(0) - ov_config = {"INFERENCE_PRECISION_HINT": "f32"} if device != "CPU" else {} - self.vae_encoder = core.compile_model(vae_encoder, device, ov_config) - self.vae_encoder_out = self.vae_encoder.output(0) - # We have to register UNet in config to be able to change it externally to collect calibration data - self.register_to_config(unet=core.compile_model(unet, device)) - self.unet_out = self.unet.output(0) - self.vae_decoder = core.compile_model(vae_decoder, device, ov_config) - self.vae_decoder_out = self.vae_decoder.output(0) - - def __call__( - self, - prompt: Union[str, List[str]], - image: PIL.Image.Image, - num_inference_steps: int = 10, - guidance_scale: float = 7.5, - image_guidance_scale: float = 1.5, - eta: float = 0.0, - latents: Optional[np.array] = None, - output_type: Optional[str] = "pil", - ): - """ - Function invoked when calling the pipeline for generation. - - Parameters: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. 
- image (`PIL.Image.Image`): - `Image`, or tensor representing an image batch which will be repainted according to `prompt`. - num_inference_steps (`int`, *optional*, defaults to 100): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. This pipeline requires a value of at least `1`. - image_guidance_scale (`float`, *optional*, defaults to 1.5): - Image guidance scale is to push the generated image towards the inital image `image`. Image guidance - scale is enabled by setting `image_guidance_scale > 1`. Higher image guidance scale encourages to - generate images that are closely linked to the source image `image`, usually at the expense of lower - image quality. This pipeline requires a value of at least `1`. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - Returns: - image ([List[Union[np.ndarray, PIL.Image.Image]]): generaited images - - """ - - # 1. Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 and image_guidance_scale >= 1.0 - # check if scheduler is in sigmas space - scheduler_is_in_sigma_space = hasattr(self.scheduler, "sigmas") - - # 2. Encode input prompt - text_embeddings = self._encode_prompt(prompt) - - # 3. Preprocess image - orig_width, orig_height = image.size - image, pad = preprocess(image) - height, width = image.shape[-2:] - - # 4. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare Image latents - image_latents = self.prepare_image_latents( - image, - do_classifier_free_guidance=do_classifier_free_guidance, - ) - - # 6. Prepare latent variables - num_channels_latents = 4 - latents = self.prepare_latents( - batch_size, - num_channels_latents, - height, - width, - text_embeddings.dtype, - latents, - ) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # Expand the latents if we are doing classifier free guidance. - # The latents are expanded 3 times because for pix2pix the guidance\ - # is applied for both the text and the input image. 
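- # Note on the batch layout used below: index 0 pairs the text embedding with the
- # encoded input image, index 1 pairs the unconditional embedding with the encoded
- # input image, and index 2 pairs the unconditional embedding with zeroed image
- # latents (matching the concatenation order in _encode_prompt and
- # prepare_image_latents). The guidance step further down combines the three
- # predictions as:
- #   noise = e_uncond + guidance_scale * (e_text - e_image)
- #           + image_guidance_scale * (e_image - e_uncond)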
- latent_model_input = np.concatenate([latents] * 3) if do_classifier_free_guidance else latents - - # concat latents, image_latents in the channel dimension - scaled_latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - scaled_latent_model_input = np.concatenate([scaled_latent_model_input, image_latents], axis=1) - - # predict the noise residual - noise_pred = self.unet([scaled_latent_model_input, t, text_embeddings])[self.unet_out] - - # Hack: - # For karras style schedulers the model does classifier free guidance using the - # predicted_original_sample instead of the noise_pred. So we need to compute the - # predicted_original_sample here if we are using a karras style scheduler. - if scheduler_is_in_sigma_space: - step_index = (self.scheduler.timesteps == t).nonzero().item() - sigma = self.scheduler.sigmas[step_index].numpy() - noise_pred = latent_model_input - sigma * noise_pred - - # perform guidance - if do_classifier_free_guidance: - noise_pred_text, noise_pred_image, noise_pred_uncond = ( - noise_pred[0], - noise_pred[1], - noise_pred[2], - ) - noise_pred = ( - noise_pred_uncond - + guidance_scale * (noise_pred_text - noise_pred_image) - + image_guidance_scale * (noise_pred_image - noise_pred_uncond) - ) - - # For karras style schedulers the model does classifier free guidance using the - # predicted_original_sample instead of the noise_pred. But the scheduler.step function - # expects the noise_pred and computes the predicted_original_sample internally. So we - # need to overwrite the noise_pred here such that the value of the computed - # predicted_original_sample is correct. - if scheduler_is_in_sigma_space: - noise_pred = (noise_pred - latents) / (-sigma) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(torch.from_numpy(noise_pred), t, torch.from_numpy(latents)).prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - - # 8. Post-processing - image = self.decode_latents(latents, pad) - - # 9. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - image = [img.resize((orig_width, orig_height), PIL.Image.Resampling.LANCZOS) for img in image] - else: - image = [cv2.resize(img, (orig_width, orig_width)) for img in image] - - return image - - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int = 1, - do_classifier_free_guidance: bool = True, - ): - """ - Encodes the prompt into text encoder hidden states. 
- - Parameters: - prompt (str or list(str)): prompt to be encoded - num_images_per_prompt (int): number of images that should be generated per prompt - do_classifier_free_guidance (bool): whether to use classifier free guidance or not - Returns: - text_embeddings (np.ndarray): text encoder hidden states - """ - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - # tokenize input prompts - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - - text_embeddings = self.text_encoder(text_input_ids)[self.text_encoder_out] - - # duplicate text embeddings for each generation per prompt, using mps friendly method - if num_images_per_prompt != 1: - bs_embed, seq_len, _ = text_embeddings.shape - text_embeddings = np.tile(text_embeddings, (1, num_images_per_prompt, 1)) - text_embeddings = np.reshape(text_embeddings, (bs_embed * num_images_per_prompt, seq_len, -1)) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_tokens: List[str] - uncond_tokens = [""] * batch_size - max_length = text_input_ids.shape[-1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - - uncond_embeddings = self.text_encoder(uncond_input.input_ids)[self.text_encoder_out] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = uncond_embeddings.shape[1] - uncond_embeddings = np.tile(uncond_embeddings, (1, num_images_per_prompt, 1)) - uncond_embeddings = np.reshape(uncond_embeddings, (batch_size * num_images_per_prompt, seq_len, -1)) - - # For classifier free guidance, you need to do two forward passes. 
- # Here, you concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings = np.concatenate([text_embeddings, uncond_embeddings, uncond_embeddings]) - - return text_embeddings - - def prepare_image_latents( - self, - image, - batch_size=1, - num_images_per_prompt=1, - do_classifier_free_guidance=True, - ): - """ - Encodes input image to latent space using VAE Encoder - - Parameters: - image (np.ndarray): input image tensor - num_image_per_prompt (int, *optional*, 1): number of image generated for promt - do_classifier_free_guidance (bool): whether to use classifier free guidance or not - Returns: - image_latents: image encoded to latent space - """ - - image = image.astype(np.float32) - - batch_size = batch_size * num_images_per_prompt - image_latents = self.vae_encoder(image)[self.vae_encoder_out] - - if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: - # expand image_latents for batch_size - additional_image_per_prompt = batch_size // image_latents.shape[0] - image_latents = np.concatenate([image_latents] * additional_image_per_prompt, axis=0) - elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: - raise ValueError(f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts.") - else: - image_latents = np.concatenate([image_latents], axis=0) - - if do_classifier_free_guidance: - uncond_image_latents = np.zeros_like(image_latents) - image_latents = np.concatenate([image_latents, image_latents, uncond_image_latents], axis=0) - - return image_latents - - def prepare_latents( - self, - batch_size: int, - num_channels_latents: int, - height: int, - width: int, - dtype: np.dtype = np.float32, - latents: np.ndarray = None, - ): - """ - Preparing noise to image generation. If initial latents are not provided, they will be generated randomly, - then prepared latents scaled by the standard deviation required by the scheduler - - Parameters: - batch_size (int): input batch size - num_channels_latents (int): number of channels for noise generation - height (int): image height - width (int): image width - dtype (np.dtype, *optional*, np.float32): dtype for latents generation - latents (np.ndarray, *optional*, None): initial latent noise tensor, if not provided will be generated - Returns: - latents (np.ndarray): scaled initial noise for diffusion - """ - shape = ( - batch_size, - num_channels_latents, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - if latents is None: - latents = randn_tensor(shape, dtype=dtype) - else: - latents = latents - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma.numpy() - return latents - - def decode_latents(self, latents: np.array, pad: Tuple[int]): - """ - Decode predicted image from latent space using VAE Decoder and unpad image result - - Parameters: - latents (np.ndarray): image encoded in diffusion latent space - pad (Tuple[int]): each side padding sizes obtained on preprocessing step - Returns: - image: decoded by VAE decoder image - """ - latents = 1 / 0.18215 * latents - image = self.vae_decoder(latents)[self.vae_decoder_out] - (_, end_h), (_, end_w) = pad[1:3] - h, w = image.shape[2:] - unpad_h = h - end_h - unpad_w = w - end_w - image = image[:, :, :unpad_h, :unpad_w] - image = np.clip(image / 2 + 0.5, 0, 1) - image = np.transpose(image, (0, 2, 3, 1)) - return image - -.. 
code:: ipython3 - - import matplotlib.pyplot as plt - - - def visualize_results( - orig_img: PIL.Image.Image, - processed_img: PIL.Image.Image, - img1_title: str, - img2_title: str, - ): - """ - Helper function for results visualization - - Parameters: - orig_img (PIL.Image.Image): original image - processed_img (PIL.Image.Image): processed image after editing - img1_title (str): title for the image on the left - img2_title (str): title for the image on the right - Returns: - fig (matplotlib.pyplot.Figure): matplotlib generated figure contains drawing result - """ - im_w, im_h = orig_img.size - is_horizontal = im_h <= im_w - figsize = (20, 30) if is_horizontal else (30, 20) - fig, axs = plt.subplots( - 1 if is_horizontal else 2, - 2 if is_horizontal else 1, - figsize=figsize, - sharex="all", - sharey="all", - ) - fig.patch.set_facecolor("white") - list_axes = list(axs.flat) - for a in list_axes: - a.set_xticklabels([]) - a.set_yticklabels([]) - a.get_xaxis().set_visible(False) - a.get_yaxis().set_visible(False) - a.grid(False) - list_axes[0].imshow(np.array(orig_img)) - list_axes[1].imshow(np.array(processed_img)) - list_axes[0].set_title(img1_title, fontsize=20) - list_axes[1].set_title(img2_title, fontsize=20) - fig.subplots_adjust(wspace=0.0 if is_horizontal else 0.01, hspace=0.01 if is_horizontal else 0.0) - fig.tight_layout() - fig.savefig("result.png", bbox_inches="tight") - return fig - -Model tokenizer and scheduler are also important parts of the pipeline. -Let us define them and put all components together. Additionally, you -can provide device selecting one from available in dropdown list. - -.. code:: ipython3 - - import requests - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import device_widget - - device = device_widget() - - device - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("instruct-pix2pix-image-editing.ipynb") - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - from transformers import CLIPTokenizer - - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler_config) - - ov_pipe = OVInstructPix2PixPipeline( - tokenizer, - scheduler, - core, - TEXT_ENCODER_OV_PATH, - VAE_ENCODER_OV_PATH, - UNET_OV_PATH, - VAE_DECODER_OV_PATH, - device=device.value, - ) - -Now, you are ready to define editing instructions and an image for -running the inference pipeline. You can find example results generated -by the model on this -`page `__, in case you -need inspiration. Optionally, you can also change the random generator -seed for latent state initialization and number of steps. - - **Note**: Consider increasing ``steps`` to get more precise results. - A suggested value is ``100``, but it will take more time to process. - -.. 
code:: ipython3 - - import ipywidgets as widgets - - style = {"description_width": "initial"} - text_prompt = widgets.Text(value=" Make it in galaxy", description="your text") - num_steps = widgets.IntSlider(min=1, max=100, value=10, description="steps:") - seed = widgets.IntSlider(min=0, max=1024, description="seed: ", value=42) - image_widget = widgets.FileUpload(accept="", multiple=False, description="Upload image", style=style) - widgets.VBox([text_prompt, seed, num_steps, image_widget]) - - - - -.. parsed-literal:: - - VBox(children=(Text(value=' Make it in galaxy', description='your text'), IntSlider(value=42, description='see… - - - - **Note**: Diffusion process can take some time, depending on what - hardware you select. - -.. code:: ipython3 - - import io - import requests - from diffusers.utils import load_image - - default_image_path = Path("default_image.png") - default_url = "https://user-images.githubusercontent.com/29454499/223343459-4ac944f0-502e-4acf-9813-8e9f0abc8a16.jpg" - - if not default_image_path.exists(): - img = load_image(default_url) - img.save(default_image_path) - - default_image = PIL.Image.open(default_image_path) - # read uploaded image - image = PIL.Image.open(io.BytesIO(image_widget.value[-1]["content"])) if image_widget.value else default_image - image = image.convert("RGB") - print("Pipeline settings") - print(f"Input text: {text_prompt.value}") - print(f"Seed: {seed.value}") - print(f"Number of steps: {num_steps.value}") - np.random.seed(seed.value) - processed_image = ov_pipe(text_prompt.value, image, num_steps.value) - - -.. parsed-literal:: - - Pipeline settings - Input text: Make it in galaxy - Seed: 42 - Number of steps: 10 - - - -.. parsed-literal:: - - 0%| | 0/10 [00:00`__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -According to ``InstructPix2Pix`` pipeline structure, UNet used for -iterative denoising of input. It means that model runs in the cycle -repeating inference on each diffusion step, while other parts of -pipeline take part only once. That is why computation cost and speed of -UNet denoising becomes the critical path in the pipeline. - -The optimization process contains the following steps: - -1. Create a calibration dataset for quantization. -2. Run ``nncf.quantize()`` to obtain quantized model. -3. Save the ``INT8`` model using ``openvino.save_model()`` function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. 
code:: ipython3 - - # Fetch `skip_kernel_extension` module - import requests - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -Prepare calibration dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use a portion of -`fusing/instructpix2pix-1000-samples `__ -dataset from Hugging Face as calibration data. To collect intermediate -model inputs for calibration we should customize ``CompiledModel``. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import datasets - from tqdm.notebook import tqdm - from transformers import Pipeline - from typing import Any, Dict, List - - class CompiledModelDecorator(ov.CompiledModel): - def __init__(self, compiled_model, prob: float, data_cache: List[Any] = None): - super().__init__(compiled_model) - self.data_cache = data_cache if data_cache else [] - self.prob = np.clip(prob, 0, 1) - - def __call__(self, *args, **kwargs): - if np.random.rand() >= self.prob: - self.data_cache.append(*args) - return super().__call__(*args, **kwargs) - - def collect_calibration_data(pix2pix_pipeline: Pipeline, subset_size: int) -> List[Dict]: - original_unet = pix2pix_pipeline.unet - pix2pix_pipeline.unet = CompiledModelDecorator(original_unet, prob=0.3) - dataset = datasets.load_dataset("fusing/instructpix2pix-1000-samples", split="train", streaming=True).shuffle(seed=42) - pix2pix_pipeline.set_progress_bar_config(disable=True) - - # Run inference for data collection - pbar = tqdm(total=subset_size) - diff = 0 - for batch in dataset: - prompt = batch["edit_prompt"] - image = batch["input_image"].convert("RGB") - _ = pix2pix_pipeline(prompt, image) - collected_subset_size = len(pix2pix_pipeline.unet.data_cache) - if collected_subset_size >= subset_size: - pbar.update(subset_size - pbar.n) - break - pbar.update(collected_subset_size - diff) - diff = collected_subset_size - - calibration_dataset = pix2pix_pipeline.unet.data_cache - pix2pix_pipeline.set_progress_bar_config(disable=False) - pix2pix_pipeline.unet = original_unet - return calibration_dataset - -.. code:: ipython3 - - %%skip not $to_quantize.value - - UNET_INT8_OV_PATH = Path("unet_int8.xml") - if not UNET_INT8_OV_PATH.exists(): - subset_size = 300 - unet_calibration_data = collect_calibration_data(ov_pipe, subset_size=subset_size) - - -.. parsed-literal:: - - /home/ltalamanova/env_ci/lib/python3.8/site-packages/diffusers/configuration_utils.py:134: FutureWarning: Accessing config attribute `unet` directly via 'OVInstructPix2PixPipeline' object attribute is deprecated. Please access 'unet' over 'OVInstructPix2PixPipeline's config object instead, e.g. 'scheduler.config.unet'. - deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) - - - -.. parsed-literal:: - - 0%| | 0/300 [00:00`__, -`blog `__ and -original `repo `__. - -In this tutorial we consider how to convert and optimize InternVL2 model -for creating multimodal chatbot. 
Additionally, we demonstrate how to -apply stateful transformation on LLM part and model optimization -techniques like weights compression using -`NNCF `__ - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Select model <#select-model>`__ -- `Convert and Optimize model <#convert-and-optimize-model>`__ - - - `Compress model weights to - 4-bit <#compress-model-weights-to-4-bit>`__ - -- `Select inference device <#select-inference-device>`__ -- `Prepare model inference - pipeline <#prepare-model-inference-pipeline>`__ -- `Run model inference <#run-model-inference>`__ -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - import platform - - %pip install -q "transformers>4.36" "torch>=2.1" "torchvision" "einops" "timm" "Pillow" "gradio>=4.36" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "nncf>=2.14.0" "datasets" - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q -U "openvino>=2024.5" "openvino-tokenizers>=2024.5" "openvino-genai>=2024.5" - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - -.. code:: ipython3 - - from pathlib import Path - import requests - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/internvl2/gradio_helper.py") - open("gradio_helper.py", "w", encoding="utf-8").write(r.text) - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w", encoding="utf-8").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py") - open("cmd_helper.py", "w", encoding="utf-8").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("internvl2.ipynb") - -Select model ------------- - - - -There are multiple InternVL2 models available in `models -collection `__. -You can select one of them for conversion and optimization in notebook -using widget bellow: - -.. code:: ipython3 - - model_ids = ["OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-4B", "OpenGVLab/InternVL2-8B"] - - - def model_selector(default=model_ids[0]): - import ipywidgets as widgets - - model_checkpoint = widgets.Dropdown( - options=model_ids, - default=default, - description="Model:", - ) - return model_checkpoint - - - model_id = model_selector() - - model_id - - - - -.. parsed-literal:: - - Dropdown(description='Model:', options=('OpenGVLab/InternVL2-1B', 'OpenGVLab/InternVL2-2B', 'OpenGVLab/InternV… - - - -.. code:: ipython3 - - print(f"Selected {model_id.value}") - pt_model_id = model_id.value - model_dir = Path(pt_model_id.split("/")[-1]) - - -.. 
parsed-literal:: - - Selected OpenGVLab/InternVL2-1B - - -Convert and Optimize model --------------------------- - - - -Our model conversion and optimization consist of following steps: 1. -Download original PyTorch model. 2. Convert model to OpenVINO format. 3. -Compress model weights using NNCF. - -Let’s consider each step more deeply. - -Convert model to OpenVINO IR format using Optimum CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate -Representation format. For convenience, we will use OpenVINO integration -with HuggingFace Optimum. `Optimum -Intel `__ is the -interface between the Transformers and Diffusers libraries and the -different tools and libraries provided by Intel to accelerate end-to-end -pipelines on Intel architectures. - -Among other use cases, Optimum Intel provides a simple interface to -optimize your Transformers and Diffusers models, convert them to the -OpenVINO Intermediate Representation (IR) format and run inference using -OpenVINO Runtime. ``optimum-cli`` provides command line interface for -model conversion and optimization. - -General command format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where task is task to export the model for, if not specified, the task -will be auto-inferred based on the model. You can find a mapping between -tasks and model classes in Optimum TaskManager -`documentation `__. -Additionally, you can specify weights compression using -``--weight-format`` argument with one of following options: ``fp32``, -``fp16``, ``int8`` and ``int4``. For int8 and int4 -`nncf `__ will be used for -weight compression. More details about model export provided in `Optimum -Intel -documentation `__. - -Compress model weights to 4-bit -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -For reducing memory -consumption, weights compression optimization can be applied using -`NNCF `__ via ``optimum-cli`` -command. In this tutorial we will demonstrates how to apply accurate -int4 weight quantization using AWQ method. - -.. raw:: html - -
- -.. raw:: html - - - -Click here for more details about weight compression - -.. raw:: html - - - -Weight compression aims to reduce the memory footprint of a model. It -can also lead to significant performance improvement for large -memory-bound models, such as Large Language Models (LLMs). LLMs and -other models, which require extensive memory to store the weights during -inference, can benefit from weight compression in the following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method primarily -designed to optimize LLMs. The main difference between weights -compression and full model quantization (post-training quantization) is -that activations remain floating-point in the case of weights -compression which leads to a better accuracy. Weight compression for -LLMs provides a solid inference performance improvement which is on par -with the performance of the full model quantization. In addition, weight -compression is data-free and does not require a calibration dataset, -making it easy to use. - -Usually 4-bit compression allows to get maximal speedup and minimal -memory footprint comparing with 8-bit compression, but in the same time -it may significantly drop model accuracy. `Activation-aware Weight -Quantization `__ (AWQ) is an algorithm -that tunes model weights for more accurate INT4 compression. It slightly -improves generation quality of compressed models, but requires -additional time for tuning weights on a calibration dataset. - -More details about weights compression, can be found in `OpenVINO -documentation `__. - -.. raw:: html - -
- -.. code:: ipython3 - - from cmd_helper import optimum_cli - - if not model_dir.exists(): - optimum_cli( - model_id.value, model_dir, additional_args={"trust-remote-code": "", "weight-format": "int4", "dataset": "contextual", "awq": "", "num-samples": "32"} - ) - - - -**Export command:** - - - -``optimum-cli export openvino --model OpenGVLab/InternVL2-1B InternVL2-1B --trust-remote-code --weight-format int4 --dataset contextual --awq --num-samples 32`` - - -.. parsed-literal:: - - 2024-11-20 12:30:38.063041: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-20 12:30:38.076313: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - WARNING: All log messages before absl::InitializeLog() is called are written to STDERR - E0000 00:00:1732091438.091128 419590 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - E0000 00:00:1732091438.095600 419590 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2024-11-20 12:30:38.110828: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - Attempt to save config using standard API has failed with 'architectures'. There may be an issue with model config, please check its correctness before usage. - The class `optimum.bettertransformers.transformation.BetterTransformer` is deprecated and will be removed in a future release. - WARNING:root:Cannot apply model.to_bettertransformer because of the exception: - The model type qwen2 is not yet supported to be used with BetterTransformer. Feel free to open an issue at https://github.com/huggingface/optimum/issues if you would like this model type to be supported. Currently supported models are: dict_keys(['albert', 'bark', 'bart', 'bert', 'bert-generation', 'blenderbot', 'bloom', 'camembert', 'blip-2', 'clip', 'codegen', 'data2vec-text', 'deit', 'distilbert', 'electra', 'ernie', 'fsmt', 'gpt2', 'gptj', 'gpt_neo', 'gpt_neox', 'hubert', 'layoutlm', 'm2m_100', 'marian', 'markuplm', 'mbart', 'opt', 'pegasus', 'rembert', 'prophetnet', 'roberta', 'roc_bert', 'roformer', 'splinter', 'tapas', 't5', 'vilt', 'vit', 'vit_mae', 'vit_msn', 'wav2vec2', 'xlm-roberta', 'yolos']).. Usage model with stateful=True may be non-effective if model does not contain torch.functional.scaled_dot_product_attention - `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. - We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format) - /home/ea/work/py311/lib/python3.11/site-packages/transformers/cache_utils.py:458: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. 
Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. - or len(self.key_cache[layer_idx]) == 0 # the layer has no cache - /home/ea/work/py311/lib/python3.11/site-packages/optimum/exporters/openvino/model_patcher.py:506: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if sequence_length != 1: - /home/ea/work/py311/lib/python3.11/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. - elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors - /home/ea/work/py311/lib/python3.11/site-packages/transformers/models/qwen2/modeling_qwen2.py:329: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - /home/ea/.cache/huggingface/modules/transformers_modules/OpenGVLab/InternVL2-1B/a84c71e158b16180df4fd1c5fe963fdf54b2cd43/modeling_internvl_chat.py:195: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - h = w = int(vit_embeds.shape[1] ** 0.5) - /home/ea/.cache/huggingface/modules/transformers_modules/OpenGVLab/InternVL2-1B/a84c71e158b16180df4fd1c5fe963fdf54b2cd43/modeling_internvl_chat.py:169: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - x = x.view(n, w, int(h * scale_factor), int(c / scale_factor)) - /home/ea/.cache/huggingface/modules/transformers_modules/OpenGVLab/InternVL2-1B/a84c71e158b16180df4fd1c5fe963fdf54b2cd43/modeling_internvl_chat.py:173: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - x = x.view(n, int(h * scale_factor), int(w * scale_factor), - /home/ea/.cache/huggingface/modules/transformers_modules/OpenGVLab/InternVL2-1B/a84c71e158b16180df4fd1c5fe963fdf54b2cd43/modeling_internvl_chat.py:174: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - int(c / (scale_factor * scale_factor))) - - -.. parsed-literal:: - - FlashAttention2 is not installed. - - -.. 
parsed-literal::
-
-    Generating test split: 100%|██████████| 506/506 [00:00<00:00, 22956.39 examples/s]
-    Collecting calibration dataset: 100%|██████████| 32/32 [04:18<00:00,  8.08s/it]
-
-
-.. parsed-literal::
-
-    Statistics collection ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 32/32 • 0:02:34 • 0:00:00
-    INFO:nncf:Statistics of the bitwidth distribution:
-    ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
-    │   Num bits (N) │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
-    ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
-    │              8 │ 28% (1 / 169)               │ 0% (0 / 168)                           │
-    ├────────────────┼─────────────────────────────┼────────────────────────────────────────┤
-    │              4 │ 72% (168 / 169)             │ 100% (168 / 168)                       │
-    ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙
-    Applying AWQ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 24/24 • 0:01:54 • 0:00:00
-    Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:00:17 • 0:00:00
-    INFO:nncf:Statistics of the bitwidth distribution:
-    ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
-    │   Num bits (N) │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
-    ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
-    │              8 │ 100% (99 / 99)              │ 100% (99 / 99)                         │
-    ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙
-    Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:00:01 • 0:00:00
-    INFO:nncf:Statistics of the bitwidth distribution:
-    ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
-    │   Num bits (N) │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
-    ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
-    │              8 │ 100% (1 / 1)                │ 100% (1 / 1)                           │
-    ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙
-    Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:00:00 • 0:00:00
-
-
-.. parsed-literal::
-
-    Attempt to save config using standard API has failed with 'architectures'. There may be an issue with model config, please check its correctness before usage.
-
-
-Select inference device
------------------------
-
-
-
-.. code:: ipython3
-
-    from notebook_utils import device_widget
-
-    device = device_widget(default="CPU", exclude=["NPU", "AUTO"])
-
-    device
-
-
-
-
-.. parsed-literal::
-
-    Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')
-
-
-
-Prepare model inference pipeline
---------------------------------
-
-
-
-`OpenVINO™ GenAI `__
-is a library of the most popular Generative AI model pipelines,
-optimized execution methods, and samples that run on top of highly
-performant `OpenVINO
-Runtime `__.
-
-This library is friendly to PC and laptop execution, and optimized for
-resource consumption. It requires no external dependencies to run
-generative models as it already includes all the core functionality
-(e.g. tokenization via openvino-tokenizers). OpenVINO™ GenAI is a flavor
-of OpenVINO™, aiming to simplify running inference of generative AI
-models. It hides the complexity of the generation process and minimizes
-the amount of code required.
-
-Inference for visual language models can be implemented using the
-OpenVINO GenAI ``VLMPipeline`` class, similar to the ``LLMPipeline``
-discussed in this
-`notebook `__.
-It supports chat mode, preserving the conversational history inside the
-pipeline, which makes it easy to implement a chatbot that can discuss
-the content of the provided images. For pipeline initialization, we
-should provide the path to the model directory and the inference device.
-
-.. code:: ipython3
-
-    import openvino_genai as ov_genai
-
-    ov_model = ov_genai.VLMPipeline(model_dir, device=device.value)
-
-Run model inference
--------------------
-
-
-
-``VLMPipeline`` uses a tokenizer and an image processor internally to
-prepare the input data; we only need to convert the image to an OpenVINO
-tensor and provide the question as a string. Additionally, we can
-provide options for controlling the generation process (e.g. the maximum
-number of generated tokens, or multinomial sampling instead of the
-greedy search approach) using ``GenerationConfig``.
-
-Generating a long response may be time consuming. To access partial
-results as soon as they are generated, without waiting for the whole
-process to finish, the Streaming API can be used. Token streaming is the
-mode in which the generative system returns the tokens one by one as the
-model generates them. This enables showing progressive generations to
-the user rather than waiting for the whole generation. Streaming is an
-essential aspect of the end-user experience as it reduces latency, one
-of the most critical aspects of a smooth experience.
-
-.. code:: ipython3
-
-    from pathlib import Path
-
-    import requests
-    from PIL import Image
-    from io import BytesIO
-    import numpy as np
-    import openvino as ov
-
-    config = ov_genai.GenerationConfig()
-    config.max_new_tokens = 100
-
-
-    def load_image(image_file):
-        if isinstance(image_file, str) and (image_file.startswith("http") or image_file.startswith("https")):
-            response = requests.get(image_file)
-            image = Image.open(BytesIO(response.content)).convert("RGB")
-        else:
-            image = Image.open(image_file).convert("RGB")
-        image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.byte)
-        return image, ov.Tensor(image_data)
-
-
-    EXAMPLE_IMAGE = Path("examples_image1.jpg")
-    EXAMPLE_IMAGE_URL = "https://huggingface.co/OpenGVLab/InternVL2-2B/resolve/main/examples/image1.jpg"
-
-    if not EXAMPLE_IMAGE.exists():
-        img_data = requests.get(EXAMPLE_IMAGE_URL).content
-        with EXAMPLE_IMAGE.open("wb") as handler:
-            handler.write(img_data)
-
-
-    def streamer(subword: str) -> bool:
-        """
-
-        Args:
-            subword: sub-word of the generated text.
-
-        Returns: flag indicating whether generation should be stopped.
-
-        """
-        print(subword, end="", flush=True)
-        return False
-
-
-    question = "Please describe the image shortly"
-
-
-    image, image_tensor = load_image(EXAMPLE_IMAGE)
-    display(image)
-    print(f"User: {question}\n")
-    print("Assistant:")
-    output = ov_model.generate(question, image=image_tensor, generation_config=config, streamer=streamer)
-
-
-
-.. image:: internvl2-with-output_files/internvl2-with-output_14_0.png
-
-
-.. parsed-literal::
-
-    User: Please describe the image shortly
-
-    Assistant:
-    .
-
-    The image shows a red panda, a type of mammal known for its distinctive red fur and white markings. The animal is resting on a wooden structure, possibly a platform or a platform-like object, with its head turned slightly towards the camera.
The background is a natural setting, with trees and foliage visible, suggesting that the red panda is in a forested or wooded area. The red panda's eyes are large and expressive, and its ears are perked up, indicating that it is alert - -Interactive demo ----------------- - - - -.. code:: ipython3 - - from gradio_helper import make_demo - - demo = make_demo(ov_model) - try: - demo.launch(debug=True, height=600) - except Exception: - demo.launch(debug=True, share=True, height=600) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/internvl2-with-output_files/internvl2-with-output_14_0.jpg b/docs/notebooks/internvl2-with-output_files/internvl2-with-output_14_0.jpg deleted file mode 100644 index a9ee87167141d3..00000000000000 --- a/docs/notebooks/internvl2-with-output_files/internvl2-with-output_14_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61d256b444d9402e44be557ad844dd0f43ad38a97edbcf9933ca04e76ec3046b -size 85717 diff --git a/docs/notebooks/internvl2-with-output_files/internvl2-with-output_14_0.png b/docs/notebooks/internvl2-with-output_files/internvl2-with-output_14_0.png deleted file mode 100644 index 1c8e906540c6bb..00000000000000 --- a/docs/notebooks/internvl2-with-output_files/internvl2-with-output_14_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e5ed8b919f743d5fbc4d225f50e9537a2ecfbe6bcdfa848bc3eeb0dabe300b0f -size 945990 diff --git a/docs/notebooks/janus-multimodal-generation-with-output.rst b/docs/notebooks/janus-multimodal-generation-with-output.rst deleted file mode 100644 index 5aedab76202160..00000000000000 --- a/docs/notebooks/janus-multimodal-generation-with-output.rst +++ /dev/null @@ -1,643 +0,0 @@ -Multimodal understanding and generation with Janus-Pro and OpenVINO -=================================================================== - -Janus is a novel autoregressive framework that unifies multimodal -understanding and generation. It addresses the limitations of previous -approaches by decoupling visual encoding into separate pathways, while -still utilizing a single, unified transformer architecture for -processing. The decoupling not only alleviates the conflict between the -visual encoder’s roles in understanding and generation, but also -enhances the framework’s flexibility. Janus surpasses previous unified -model and matches or exceeds the performance of task-specific models. -The simplicity, high flexibility, and effectiveness of Janus make it a -strong candidate for next-generation unified multimodal models. - -More details can be found in the -`paper `__, original -`repository `__ and `model -card `__ - -Janus-Pro is an advanced version of Janus, significantly improving -multimodal understanding and visual generation. More details can be -found in -`paper `__. - -In this tutorial we consider how to run and optimize Janus using -OpenVINO. 
- -**Table of contents:** - -- `Prerequisites <#prerequisites>`__ -- `Convert and Optimize model <#convert-and-optimize-model>`__ - - - `Compress model weights to - 4-bit <#compress-model-weights-to-4-bit>`__ - -- `Create Inference Pipeline <#create-inference-pipeline>`__ - - - `Select Inference Device <#select-inference-device>`__ - - `Run visual language chat <#run-visual-language-chat>`__ - - `Run Image generation <#run-image-generation>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - from pathlib import Path - import requests - - utility_files = ["notebook_utils.py"] - local_helpers = ["ov_janus_helper.py", "gradio_helper.py"] - - base_utils_url = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/" - base_local_files_url = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/janus-multimodal-generation/" - - - for util_path in utility_files: - if not Path(util_path).exists(): - r = requests.get(base_utils_url + util_path) - with open(util_path, "w") as f: - f.write(r.text) - - for util_path in local_helpers: - if not Path(util_path).exists(): - r = requests.get(base_local_files_url + util_path) - with open(util_path, "w") as f: - f.write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("janus-multimodal-generation.ipynb") - -.. code:: ipython3 - - import platform - - %pip install -q "gradio>=4.19" "torch>=2.2" "torchvision" "safetensors" "transformers>=4.45" "nncf>=2.14" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "git+https://github.com/deepseek-ai/Janus" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -U --pre "openvino>2024.5" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - - if platform.python_version_tuple()[1] == "9": - %pip install -q "transformers<4.48" - -Convert and Optimize model --------------------------- - - - -Janus is PyTorch model. OpenVINO supports PyTorch models via conversion -to OpenVINO Intermediate Representation (IR). `OpenVINO model conversion -API `__ -should be used for these purposes. ``ov.convert_model`` function accepts -original PyTorch model instance and example input for tracing and -returns ``ov.Model`` representing this model in OpenVINO framework. -Converted model can be used for saving on disk using ``ov.save_model`` -function or directly loading on device using ``core.complie_model``. - -The script ``ov_janus_helper.py`` contains helper function for model -conversion, please check its content if you interested in conversion -details. - -.. raw:: html - -
-
-.. raw:: html
-
-
-
-Click here for more detailed explanation of conversion steps
-
-.. raw:: html
-
-
-
-Janus is an autoregressive transformer generative model, which means
-that each generation step depends on the model output from the previous
-step. The generation approach is based on the assumption that the
-probability distribution of a token sequence can be decomposed into the
-product of conditional next-token distributions. In other words, the
-model predicts the next token in a loop, guided by the previously
-generated tokens, until a stop condition is reached (the generated
-sequence reaches the maximum length or an end-of-generation token is
-produced). The way the next token is selected from the predicted
-probabilities is driven by the chosen decoding methodology. You can find
-more information about the most popular decoding methods in this blog.
-The entry point for the generation process for models from the Hugging
-Face Transformers library is the ``generate`` method. You can find more
-information about its parameters and configuration in the documentation.
-To preserve flexibility in selecting the decoding methodology, we will
-convert only a single inference step of the model.
-
-For both tasks, image understanding and image generation, Janus uses the
-same basic transformer architecture in ``language_model`` and changes
-only the components responsible for preparing the input embeddings
-(joint image embeddings prepared with ``vision_embeddings_model`` plus
-text embeddings prepared with ``text_embeddings_model`` for image
-understanding; ``text_embeddings_model`` on the first step as the
-initial prompt embeddings and ``gen_embeddings_model`` for the following
-steps in image generation) and for converting the final hidden state to
-token probabilities (``lm_head`` for text tokens, ``gen_head`` for image
-tokens). Additionally, for image generation the model uses
-``gen_decoder`` to convert the generated image tokens into images.
-
-To sum up, the model consists of 7 parts:
-
-- **Image Embeddings** for encoding input images into the embedding
-  space in the image understanding task.
-- **Text Embedding** for converting input text tokens into the
-  embedding space.
-- **Gen Embeddings** for encoding image generation tokens into the
-  embedding space in the image generation task.
-- **Language Model** for generating the hidden state guided by the
-  input embeddings.
-- **LM Head** for converting the Language Model hidden state to text
-  token probabilities.
-- **Gen Head** for converting the Language Model hidden state to image
-  token probabilities.
-- **Gen Decoder** for decoding generated image tokens from the latent
-  token space to image tensor space.
-
-To preserve the original model's flexibility of switching between tasks,
-we should also preserve the original model partitioning and convert each
-model part separately.
-
-.. raw:: html
-
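-
-The heavy lifting is done by ``convert_janus_model`` from the helper
-script. For orientation only, a minimal, self-contained sketch of the
-conversion API on a toy PyTorch module (an assumed stand-in, not one of
-the Janus parts) could look like this:
-
-.. code:: ipython3
-
-    # Toy example of the PyTorch -> OpenVINO conversion flow described above.
-    import torch
-    import openvino as ov
-
-    toy_module = torch.nn.Linear(8, 4)        # stand-in for a single model part
-    example_input = torch.zeros(1, 8)         # example input used for tracing
-
-    ov_toy = ov.convert_model(toy_module, example_input=example_input)
-    ov.save_model(ov_toy, "toy_linear.xml")   # serialize the IR to disk
-    compiled = ov.Core().compile_model("toy_linear.xml", "CPU")
-    print(compiled(example_input.numpy())[0].shape)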
-
-Compress model weights to 4-bit
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-To reduce memory consumption, weight compression optimization can be
-applied using `NNCF `__.
-
-.. raw:: html
-
-
-.. raw:: html
-
-
-
-Click here for more details about weight compression
-
-.. raw:: html
-
-
-
-Weight compression aims to reduce the memory footprint of a model. It
-can also lead to significant performance improvement for large
-memory-bound models, such as Large Language Models (LLMs). LLMs and
-other models, which require extensive memory to store the weights during
-inference, can benefit from weight compression in the following ways:
-
-- enabling the inference of exceptionally large models that cannot be
-  accommodated in the memory of the device;
-
-- improving the inference performance of the models by reducing the
-  latency of the memory access when computing the operations with
-  weights, for example, Linear layers.
-
-`Neural Network Compression Framework
-(NNCF) `__ provides 4-bit /
-8-bit mixed weight quantization as a compression method primarily
-designed to optimize LLMs. The main difference between weight
-compression and full model quantization (post-training quantization) is
-that activations remain floating-point in the case of weight
-compression, which leads to better accuracy. Weight compression for
-LLMs provides a solid inference performance improvement that is on par
-with full model quantization. In addition, weight compression is
-data-free and does not require a calibration dataset, making it easy to
-use.
-
-The ``nncf.compress_weights`` function can be used for performing weight
-compression. The function accepts an OpenVINO model and other
-compression parameters. Compared to INT8 compression, INT4 compression
-improves performance even more, but introduces a minor drop in
-prediction quality.
-
-More details about weight compression can be found in the `OpenVINO
-documentation `__.
-
-.. raw:: html
-
- -.. code:: ipython3 - - import nncf - from ov_janus_helper import convert_janus_model - import ipywidgets as widgets - - model_ids = ["deepseek-ai/Janus-Pro-1B", "deepseek-ai/Janus-Pro-7B", "deepseek-ai/Janus-1.3B"] - - compression_configuration = { - "mode": nncf.CompressWeightsMode.INT4_ASYM, - "group_size": 64, - "ratio": 1.0, - } - - # uncomment the line to see model conversion code - # ??convert_janus_model - - model_id = widgets.Dropdown(options=model_ids, value=model_ids[0]) - model_id - - -.. parsed-literal:: - - :247: DeprecationWarning: The `openvino.runtime` module is deprecated and will be removed in the 2026.0 release. Please replace `openvino.runtime` with `openvino`. - - -.. parsed-literal:: - - Python version is above 3.10, patching the collections module. - - -.. parsed-literal:: - - 2025-01-28 11:36:54.976268: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2025-01-28 11:36:54.989484: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - WARNING: All log messages before absl::InitializeLog() is called are written to STDERR - E0000 00:00:1738049815.003897 2873562 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - E0000 00:00:1738049815.008201 2873562 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2025-01-28 11:36:55.024219: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - /home/ea/work/py311/lib/python3.11/site-packages/transformers/models/auto/image_processing_auto.py:524: FutureWarning: The image_processor_class argument is deprecated and will be removed in v4.42. Please use `slow_image_processor_class`, or `fast_image_processor_class` instead - warnings.warn( - /home/ea/work/py311/lib/python3.11/site-packages/wandb/analytics/sentry.py:82: SentryHubDeprecationWarning: `sentry_sdk.Hub` is deprecated and will be removed in a future major release. Please consult our 1.x to 2.x migration guide for details on how to migrate `Hub` usage to the new API: https://docs.sentry.io/platforms/python/migration/1.x-to-2.x - self.hub = sentry_sdk.Hub(client) - - - - -.. parsed-literal:: - - Dropdown(options=('deepseek-ai/Janus-Pro-1B', 'deepseek-ai/Janus-Pro-7B', 'deepseek-ai/Janus-1.3B'), value='de… - - - -.. code:: ipython3 - - from pathlib import Path - - model_path = Path(model_id.value.split("/")[-1] + "-ov") - convert_janus_model(model_id.value, model_path, compression_configuration) - - -.. parsed-literal:: - - ⌛ Janus-Pro-1B conversion started. Be patient, it may takes some time. - ⌛ Load Original model - - - -.. parsed-literal:: - - preprocessor_config.json: 0%| | 0.00/346 [00:00. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message. - - - -.. parsed-literal:: - - processor_config.json: 0%| | 0.00/210 [00:00{input_prompt}\n", - "images": [str(image_path)], - }, - {"role": "Assistant", "content": ""}, - ] - pil_images = load_pil_images(conversation) - -.. code:: ipython3 - - from transformers import TextStreamer - - prepare_inputs = processor(conversations=conversation, images=pil_images, force_batchify=True) - # run image encoder to get the image embeddings - inputs_embeds = ov_model.prepare_inputs_embeds(**prepare_inputs) - - streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True) - - print(f"Question:\n{input_prompt}") - display(pil_images[0]) - print("Answer:") - - answer_token_ids = ov_model.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=prepare_inputs.attention_mask, - pad_token_id=processor.tokenizer.eos_token_id, - bos_token_id=processor.tokenizer.bos_token_id, - eos_token_id=processor.tokenizer.eos_token_id, - max_new_tokens=128, - do_sample=False, - streamer=streamer, - ) - - -.. parsed-literal:: - - Question: - Describe image in details - - - -.. image:: janus-multimodal-generation-with-output_files/janus-multimodal-generation-with-output_14_1.png - - -.. parsed-literal:: - - Answer: - The image shows a gray tabby cat lying on its back inside an open cardboard box. The cat appears to be relaxed and comfortable, with its belly exposed and its paws relaxed. The box is placed on a light-colored carpet, and there is a beige sofa in the background. The overall setting appears to be a cozy indoor environment, likely a living room. - - -Run Image generation -~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from ov_janus_helper import generate_image - - # Uncomment the line to see image generation code - # ??generate_image - -.. code:: ipython3 - - from transformers import set_seed - - set_seed(12345) - - images = generate_image( - ov_model, - processor, - "A close-up professional photo of Yorkshire Terrier on beach, extrimely detailed, hyper realistic, full hd", - output_dir=None, - parallel_size=1, - ) - - - -.. parsed-literal:: - - 0%| | 0/576 [00:00`__ is a Python library for -accelerator-oriented array computation and program transformation, -designed for high-performance numerical computing and large-scale -machine learning. JAX provides a familiar NumPy-style API for ease of -adoption by researchers and engineers. - -In this tutorial we will show how to convert JAX -`ViT `__ -and -`Mixer `__ -models in OpenVINO format. - -.. raw:: html - -
- -.. raw:: html - - - -Click here for more detailed information about the models - -.. raw:: html - - - -Vision Transformer -~~~~~~~~~~~~~~~~~~ - -Overview of the model: authors split an image into fixed-size patches, -linearly embed each of them, add position embeddings, and feed the -resulting sequence of vectors to a standard Transformer encoder. In -order to perform classification, authors use the standard approach of -adding an extra learnable “classification token” to the sequence. - -MLP-Mixer -~~~~~~~~~ - -MLP-Mixer (Mixer for short) consists of per-patch linear embeddings, -Mixer layers, and a classifier head. Mixer layers contain one -token-mixing MLP and one channel-mixing MLP, each consisting of two -fully-connected layers and a GELU nonlinearity. Other components -include: skip-connections, dropout, and linear classifier head. - -.. raw:: html - -
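-
-As a toy illustration of the patch-splitting step described above (the
-shapes are chosen for ViT-B_32; this snippet is not part of the original
-notebook):
-
-.. code:: ipython3
-
-    import numpy as np
-
-    # Split a 224x224 RGB image into non-overlapping 32x32 patches and
-    # flatten each patch into a vector, as done before the learned linear
-    # patch embedding of the Vision Transformer.
-    image = np.random.rand(224, 224, 3)
-    patch = 32
-    grid = 224 // patch                                 # 7 patches per side
-    patches = image.reshape(grid, patch, grid, patch, 3)
-    patches = patches.transpose(0, 2, 1, 3, 4).reshape(-1, patch * patch * 3)
-    print(patches.shape)                                # (49, 3072)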
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Load and run the original model and a - sample <#load-and-run-the-original-model-and-a-sample>`__ -- `Convert the model to OpenVINO - IR <#convert-the-model-to-openvino-ir>`__ -- `Compiling the model <#compiling-the-model>`__ -- `Run OpenVINO model inference <#run-openvino-model-inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utilspy").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("jax-classification-to-openvino.ipynb") - -.. code:: ipython3 - - from cmd_helper import clone_repo - - - clone_repo("https://github.com/google-research/vision_transformer.git") - -.. code:: ipython3 - - %pip install --pre -Uq "openvino>=2024.5.0" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - %pip install -q "tensorflow-macos>=2.5 jax-metal>=0.4.2"; sys_platform == 'darwin' and platform_machine == 'arm64'" # macOS M1 and M2 - %pip install -q "tensorflow>=2.5 jax>=0.4.2"; sys_platform == 'darwin' and platform_machine != 'arm64'" # macOS x86 - %pip install -q "tensorflow-cpu>=2.5 jax>=0.4.2"; sys_platform != 'darwin'" - %pip install -q Pillow "absl-py>=0.12.0" "flax>=0.6.4" "pandas>=1.1.0" tf_keras tqdm "einops>=0.3.0" "ml-collections>=0.1.0" - -.. code:: ipython3 - - import PIL - import jax - import numpy as np - - from vit_jax import checkpoint - from vit_jax import models_vit - from vit_jax import models_mixer - from vit_jax.configs import models as models_config - - import openvino as ov - -.. code:: ipython3 - - import ipywidgets as widgets - - available_models = ["ViT-B_32", "Mixer-B_16"] - - - model_to_use = widgets.Select( - options=available_models, - value=available_models[0], - description="Select model:", - disabled=False, - ) - - model_to_use - - - - -.. parsed-literal:: - - Select(description='Select model:', options=('ViT-B_32', 'Mixer-B_16'), value='ViT-B_32') - - - -Load and run the original model and a sample --------------------------------------------- - - - -Download a pre-trained model. - -.. 
code:: ipython3 - - from notebook_utils import download_file - - - model_name = model_to_use.value - model_config = models_config.MODEL_CONFIGS[model_name] - - - if model_name.startswith("Mixer"): - # Download model trained on imagenet2012 - if not Path(f"{model_name}_imagenet2012.npz").exists(): - download_file(f"https://storage.googleapis.com/mixer_models/imagenet1k/{model_name}.npz", filename=f"{model_name}_imagenet2012.npz") - model = models_mixer.MlpMixer(num_classes=1000, **model_config) - else: - # Download model pre-trained on imagenet21k and fine-tuned on imagenet2012. - if not Path(f"{model_name}_imagenet2012.npz").exists(): - model_name_path = download_file( - f"https://storage.googleapis.com/vit_models/imagenet21k+imagenet2012/{model_name}.npz", filename=f"{model_name}_imagenet2012.npz" - ) - model = models_vit.VisionTransformer(num_classes=1000, **model_config) - - - -.. parsed-literal:: - - ViT-B_32_imagenet2012.npz: 0%| | 0.00/337M [00:00`__ -should be used for these purposes. ``ov.convert_model`` function accepts -original JAX model instance and example input for tracing and returns -``ov.Model`` representing this model in OpenVINO framework. Converted -model can be used for saving on disk using ``ov.save_model`` function or -directly loading on device using ``core.complie_model``. - -Before conversion we need to create the -`Jaxprs `__ -(JAX’s internal intermediate representation (IR) of programs) object by -tracing a Python function using the -`jax.make_jaxpr `__ -function. [``jax.make_jaxpr``] take a function as argument, that should -perform the forward pass. In our case it is calling of ``model.apply`` -method. But ``model.apply`` requires not only input data, but also -``params`` and keyword argument ``train=False`` in our case. To handle -it create a wrapper function ``model_apply`` that calls -``model.apply(params, x, train=False)``. - -.. code:: ipython3 - - from pathlib import Path - - - model_path = Path(f"models/{model_name}.xml") - - - def model_apply(x): - return model.apply(dict(params=params), x, train=False) - - - jaxpr = jax.make_jaxpr(model_apply)((np.array(img) / 128 - 1)[None, ...]) - - converted_model = ov.convert_model(jaxpr) - ov.save_model(converted_model, model_path) - -Compiling the model -------------------- - - - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - from notebook_utils import device_widget - - - core = ov.Core() - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - compiled_model = core.compile_model(model_path, device.value) - -Run OpenVINO model inference ----------------------------- - -.. code:: ipython3 - - (logits_ov,) = list(compiled_model(data).values())[0] - - preds = np.array(jax.nn.softmax(logits_ov)) - for idx in preds.argsort()[:-11:-1]: - print(f"{preds[idx]:.5f} : {imagenet_labels[idx]}", end="") - - -.. parsed-literal:: - - 0.95255 : alp - 0.03881 : valley, vale - 0.00192 : cliff, drop, drop-off - 0.00173 : ski - 0.00059 : lakeside, lakeshore - 0.00049 : promontory, headland, head, foreland - 0.00036 : volcano - 0.00021 : snowmobile - 0.00017 : mountain_bike, all-terrain_bike, off-roader - 0.00017 : mountain_tent - - -.. 
code:: ipython3 - - # Cleanup - # %pip uninstall -q -y "tensorflow-cpu" tensorflow tf_keras diff --git a/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.jpg b/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.jpg deleted file mode 100644 index 4e389f1fcb75af..00000000000000 --- a/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7b9ce29fc2d800faa2667de9fc47770370f12c829217c22142bfcd1f5e1a2752 -size 33195 diff --git a/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.png b/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.png deleted file mode 100644 index 901c02bacbed30..00000000000000 --- a/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ffe240660061089dfc38c95d77b074051cc37b794c4d096e5841cf8d575311d9 -size 237944 diff --git a/docs/notebooks/jina-clip-with-output.rst b/docs/notebooks/jina-clip-with-output.rst deleted file mode 100644 index 36043127ca11ec..00000000000000 --- a/docs/notebooks/jina-clip-with-output.rst +++ /dev/null @@ -1,1001 +0,0 @@ -CLIP model with Jina CLIP and OpenVINO --------------------------------------- - -`jina-clip-v1 `__ is a -state-of-the-art English multimodal(text-image) embedding model trained -by `Jina AI `__. It -bridges the gap between traditional text embedding models, which excel -in text-to-text retrieval but are incapable of cross-modal tasks, and -models that effectively align image and text embeddings but are not -optimized for text-to-text retrieval. jina-clip-v1 offers robust -performance in both domains. Its dual capability makes it an excellent -tool for multimodal retrieval-augmented generation (MuRAG) applications, -allowing seamless text-to-text and text-to-image searches within a -single model. jina-clip-v1 can be used for a variety of multimodal -applications, such as: image search by describing them in text, -multimodal question answering, multimodal content generation. Jina AI -has also provided the Embeddings API as an easy-to-use interface for -working with jina-clip-v1 and their other embedding models. - -In this notebook we will load the model with Hugging Face Transformers, -convert it to OpenVINO IR format, optimize it with NNCF and show the -life demo. 
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Instantiate model <#instantiate-model>`__ - - - `Prepare input data <#prepare-input-data>`__ - - `Run PyTorch model inference <#run-pytorch-model-inference>`__ - -- `Run OpenVINO model inference <#run-openvino-model-inference>`__ - - - `Prepare input data <#prepare-input-data>`__ - - `Convert Model to OpenVINO IR - format <#convert-model-to-openvino-ir-format>`__ - - `Select inference device <#select-inference-device>`__ - - `Compile model and run - inference <#compile-model-and-run-inference>`__ - -- `Quantize model to INT8 using - NNCF <#quantize-model-to-int8-using-nncf>`__ - - - `Prepare datasets <#prepare-datasets>`__ - - - `Dataset with text data <#dataset-with-text-data>`__ - - `Dataset with image data <#dataset-with-image-data>`__ - - - `Perform quantization <#perform-quantization>`__ - - - `Quantization of text model <#quantization-of-text-model>`__ - - `Quantization of image model <#quantization-of-image-model>`__ - - - `Compare File Size <#compare-file-size>`__ - - `Compare inference time of the FP16 IR and quantized - models <#compare-inference-time-of-the-fp16-ir-and-quantized-models>`__ - -- `Gradio demo <#gradio-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - import platform - - %pip install -q "openvino>=2024.6.0" "datasets>=2.20" "nncf>=2.14.0" - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "gradio>=4.19" "pillow" "einops" "timm" "transformers[torch]>=4.39" "torch>=2.1" "matplotlib>=3.4" "typing_extensions>=4.9" - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - - -.. parsed-literal:: - - ERROR: Could not find a version that satisfies the requirement openvino>=2024.6.0 (from versions: 2021.3.0, 2021.4.0, 2021.4.1, 2021.4.2, 2022.1.0, 2022.2.0, 2022.3.0, 2022.3.1, 2022.3.2, 2023.0.0.dev20230119, 2023.0.0.dev20230217, 2023.0.0.dev20230407, 2023.0.0.dev20230427, 2023.0.0, 2023.0.1, 2023.0.2, 2023.1.0.dev20230623, 2023.1.0.dev20230728, 2023.1.0.dev20230811, 2023.1.0, 2023.2.0.dev20230922, 2023.2.0, 2023.3.0, 2024.0.0, 2024.1.0, 2024.2.0, 2024.3.0, 2024.4.0, 2024.4.1.dev20240926) - ERROR: No matching distribution found for openvino>=2024.6.0 - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - -Instantiate model ------------------ - - - -Let’s load the -`jinaai/jina-clip-v1 `__ -with Hugging Face Transformers. We creates PyTorch model class instance -with ``AutoModel``, load and initialize it with model configuration and -weights, using ``from_pretrained`` method. - -.. code:: ipython3 - - from transformers import AutoModel - - model = AutoModel.from_pretrained("jinaai/jina-clip-v1", trust_remote_code=True) - - -.. parsed-literal:: - - 2025-02-04 02:41:35.566346: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
- 2025-02-04 02:41:35.599646: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2025-02-04 02:41:36.268007: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - -Prepare input data -~~~~~~~~~~~~~~~~~~ - - - -The model can encode meaningful sentences in English as text input. -Image could be provided to model as local file path, URLs or directly -passing in the PIL.Image objects. - -.. code:: ipython3 - - from PIL import Image - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - # image input data - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, device_widget, quantization_widget - - if not Path("data/furseal.png").exists(): - download_file( - "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/3f779fc1-c1b2-4dec-915a-64dae510a2bb", - "furseal.png", - directory="data", - ) - - img_furseal = Image.open("./data/furseal.png") - - if not Path("data/coco.jpg").exists(): - image_path = download_file( - "https://github.com/user-attachments/assets/1c66a05d-7442-45c2-a34c-bb08b95af7a6", - "coco.jpg", - directory="data", - ) - - img_coco = Image.open("./data/coco.jpg") - - IMAGE_INPUTS = [img_furseal, img_coco] - - # text input data - TEXT_INPUTS = ["Seal", "Cobra", "Rat", "Penguin", "Dog"] - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("jina-clip.ipynb") - - - -.. parsed-literal:: - - furseal.png: 0%| | 0.00/2.55M [00:00 self.linear_biases.shape[-1]: - - -.. code:: ipython3 - - fp16_vision_model_path = Path("jina-clip-vision_v1_fp16.xml") - - if not fp16_vision_model_path.exists(): - ov_vision_model = ov.convert_model(model.vision_model, example_input=vision_inputs["pixel_values"]) - ov.save_model(ov_vision_model, fp16_vision_model_path) - - -.. parsed-literal:: - - /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/jinaai/jina-clip-implementation/51f02de9f2cf8afcd3bac4ce996859ba96f9f8e9/eva_model.py:471: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert h == self.img_size[0] and w == self.img_size[1], ( - - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -For starting work, please select inference device from dropdown list. - -.. code:: ipython3 - - device = device_widget() - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Compile model and run inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - compiled_text_model = core.compile_model(fp16_text_model_path, device.value) - compiled_vision_model = core.compile_model(fp16_vision_model_path, device.value) - -.. 
code:: ipython3 - - text_ov_res = compiled_text_model(text_inputs["input_ids"]) - vis_ov_res = compiled_vision_model(vision_inputs["pixel_values"]) - - res = calc_simularity_softmax(vis_ov_res[0], text_ov_res[0]) - visionize_result(img_furseal, TEXT_INPUTS, np.array(res[0])) - - - -.. image:: jina-clip-with-output_files/jina-clip-with-output_21_0.png - - -Quantize model to INT8 using NNCF ---------------------------------- - - - -Lets speed up the model by applying 8-bit post-training quantization -from `NNCF `__ (Neural Network -Compression Framework) and infer quantized model via OpenVINO™ Toolkit. -`NNCF `__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. The optimization process contains the following steps: - -1. Prepare quantization dataset -2. Quantize the converted OpenVINO model with NNCF with - ``nncf.quantize()``. -3. Save the ``INT8`` model using ``openvino.save_model()`` function. -4. Compare model size of converted and quantized models. -5. Compare performance of converted and quantized models. - -.. - - **Note:** quantization process may require additional time and memory - for performing. You can disable it using widget below: - -.. code:: ipython3 - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - if not Path("skip_kernel_extension.py").exists(): - # Fetch `skip_kernel_extension` module - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - int8_text_model_path = Path("jina-clip-text_v1_int8.xml") - int8_vision_model_path = Path("jina-clip-vision_v1_int8.xml") - - %load_ext skip_kernel_extension - -Prepare datasets -~~~~~~~~~~~~~~~~ - - - -The `Conceptual -Captions `__ dataset -consisting of ~3.3M images annotated with captions is used to quantize -model. - -Dataset with text data -^^^^^^^^^^^^^^^^^^^^^^ - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import torch - from datasets import load_dataset - from tqdm.notebook import tqdm - import requests - from io import BytesIO - import numpy as np - from PIL import Image - from requests.packages.urllib3.exceptions import InsecureRequestWarning - requests.packages.urllib3.disable_warnings(InsecureRequestWarning) - - - def check_text_data(data): - """ - Check if the given data is text-based. - """ - if isinstance(data, str): - return True - if isinstance(data, list): - return all(isinstance(x, str) for x in data) - return False - - - def collate_fn_text(example, text_column="caption"): - """ - Preprocesses an example by loading and transforming text data. - Checks if the text data in the example is valid by calling the `check_text_data` function. - If there is any error during the download process, returns None. - Returns the preprocessed inputs with transformed image and text data. 
- """ - assert len(example) == 1 - example = example[0] - - if not check_text_data(example[text_column]): - raise ValueError("Text data is not valid") - - text_input = tokenizer( - example[text_column], - return_tensors='pt', - **tokenizer_kwargs) - - return text_input - - - def prepare_calibration_data_text(dataloader, init_steps): - """ - This function prepares calibration data from a dataloader for a specified number of initialization steps. - It iterates over the dataloader, fetching batches and storing the relevant data. - """ - data = [] - print(f"Fetching {init_steps} samples for the initialization...") - with tqdm(total=init_steps) as pbar: - for batch in dataloader: - if len(data) == init_steps: - break - if batch: - pbar.update(1) - with torch.no_grad(): - data.append(batch["input_ids"].to("cpu")) - return data - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import logging - import nncf - - if not int8_text_model_path.exists(): - dataset = load_dataset("google-research-datasets/conceptual_captions", trust_remote_code=True) - train_dataset = dataset["train"].shuffle(seed=42) - - dataloader_text = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn_text, batch_size=1) - calibration_data_text = prepare_calibration_data_text(dataloader_text, 50) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - Fetching 50 samples for the initialization... - - - -.. parsed-literal:: - - 0%| | 0/50 [00:00`__ -- `Import the packages needed for successful - execution <#import-the-packages-needed-for-successful-execution>`__ - - - `Settings: Including path to the serialized model files and input - data - files <#settings-including-path-to-the-serialized-model-files-and-input-data-files>`__ - - `Download Model Checkpoint <#download-model-checkpoint>`__ - - `Defining the ConvE model - class <#defining-the-conve-model-class>`__ - - `Defining the dataloader <#defining-the-dataloader>`__ - - `Evaluate the trained ConvE - model <#evaluate-the-trained-conve-model>`__ - - `Prediction on the Knowledge - graph. <#prediction-on-the-knowledge-graph->`__ - - `Convert the trained PyTorch model to IR format for OpenVINO - inference <#convert-the-trained-pytorch-model-to-ir-format-for-openvino-inference>`__ - - `Evaluate the model performance with - OpenVINO <#evaluate-the-model-performance-with-openvino>`__ - -- `Select inference device <#select-inference-device>`__ - - - `Determine the platform specific speedup obtained through OpenVINO - graph - optimizations <#determine-the-platform-specific-speedup-obtained-through-openvino-graph-optimizations>`__ - - `Benchmark the converted OpenVINO model using benchmark - app <#benchmark-the-converted-openvino-model-using-benchmark-app>`__ - - `Conclusions <#conclusions>`__ - - `References <#references>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - %pip install -q "openvino>=2023.1.0" torch scikit-learn tqdm --extra-index-url https://download.pytorch.org/whl/cpu - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -Windows specific settings -------------------------- - - - -.. 
code:: ipython3 - - # On Windows, add the directory that contains cl.exe to the PATH - # to enable PyTorch to find the required C++ tools. - # This code assumes that Visual Studio 2019 is installed in the default directory. - # If you have a different C++ compiler, please add the correct path - # to os.environ["PATH"] directly. - # Note that the C++ Redistributable is not enough to run this notebook. - - # Adding the path to os.environ["LIB"] is not always required - # - it depends on the system's configuration - - import sys - - if sys.platform == "win32": - import distutils.command.build_ext - import os - from pathlib import Path - - VS_INSTALL_DIR = r"C:/Program Files (x86)/Microsoft Visual Studio" - cl_paths = sorted(list(Path(VS_INSTALL_DIR).glob("**/Hostx86/x64/cl.exe"))) - if len(cl_paths) == 0: - raise ValueError( - "Cannot find Visual Studio. This notebook requires a C++ compiler. If you installed " - "a C++ compiler, please add the directory that contains" - "cl.exe to `os.environ['PATH']`." - ) - else: - # If multiple versions of MSVC are installed, get the most recent version - cl_path = cl_paths[-1] - vs_dir = str(cl_path.parent) - os.environ["PATH"] += f"{os.pathsep}{vs_dir}" - # Code for finding the library dirs from - # https://stackoverflow.com/questions/47423246/get-pythons-lib-path - d = distutils.core.Distribution() - b = distutils.command.build_ext.build_ext(d) - b.finalize_options() - os.environ["LIB"] = os.pathsep.join(b.library_dirs) - print(f"Added {vs_dir} to PATH") - -Import the packages needed for successful execution ---------------------------------------------------- - - - -.. code:: ipython3 - - import json - from pathlib import Path - import sys - import time - - import numpy as np - import torch - from sklearn.metrics import accuracy_score - from torch.nn import functional as F, Parameter - from torch.nn.init import xavier_normal_ - - import openvino as ov - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("knowledge-graphs-conve.ipynb") - -Settings: Including path to the serialized model files and input data files -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Path to the pretrained model checkpoint - modelpath = Path("models/conve.pt") - - # Entity and relation embedding dimensions - EMB_DIM = 300 - - # Top K vals to consider from the predictions - TOP_K = 2 - - # Required for OpenVINO conversion - output_dir = Path("models") - base_model_name = "conve" - - output_dir.mkdir(exist_ok=True) - - # Paths where PyTorch and OpenVINO IR models will be stored - ir_path = Path(output_dir / base_model_name).with_suffix(".xml") - -.. 
code:: ipython3 - - data_folder = Path("data") - - entdatapath = data_folder / "kg_training_entids.txt" - - if not entdatapath.exists(): - # Download the file containing the entities and entity IDs - entdatapath = download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/text/countries_S1/kg_training_entids.txt", - directory=data_folder, - ) - - reldatapath = data_folder / "kg_training_relids.txt" - - if not reldatapath.exists(): - # Download the file containing the relations and relation IDs - reldatapath = download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/text/countries_S1/kg_training_relids.txt", - directory=data_folder, - ) - - testdatapath = data_folder / "e1rel_to_e2_ranking_test.json" - - if not testdatapath.exists(): - # Download the test data file - testdatapath = download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/json/countries_S1/e1rel_to_e2_ranking_test.json", - directory=data_folder, - ) - - - -.. parsed-literal:: - - kg_training_entids.txt: 0%| | 0.00/3.79k [00:00 - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'e1'!. This input will be filled with random values! - [ WARNING ] No input files were given for input 'rel'!. This input will be filled with random values! - [ INFO ] Fill input 'e1' with random values - [ INFO ] Fill input 'rel' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 10000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 1.50 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 94836 iterations - [ INFO ] Duration: 10001.91 ms - [ INFO ] Latency: - [ INFO ] Median: 1.06 ms - [ INFO ] Average: 1.09 ms - [ INFO ] Min: 0.76 ms - [ INFO ] Max: 99.96 ms - [ INFO ] Throughput: 9481.79 FPS - - -Conclusions -~~~~~~~~~~~ - - - -In this notebook, we convert the trained PyTorch knowledge graph -embeddings model to the OpenVINO format. We confirm that there are no -accuracy differences post conversion. We also perform a sample -evaluation on the knowledge graph. Then, we determine the platform -specific speedup in runtime performance that can be obtained through -OpenVINO graph optimizations. To learn more about the OpenVINO -performance optimizations, refer to: -https://docs.openvino.ai/2024/openvino-workflow/running-inference/optimize-inference.html - -References -~~~~~~~~~~ - - - -1. Convolutional 2D Knowledge Graph Embeddings, Tim Dettmers et - al. (https://arxiv.org/abs/1707.01476) -2. Model implementation: https://github.com/TimDettmers/ConvE - -The ConvE model implementation used in this notebook is licensed under -the MIT License. 
The license is displayed below: MIT License
-
-Copyright (c) 2017 Tim Dettmers
-
-Permission is hereby granted, free of charge, to any person obtaining a
-copy of this software and associated documentation files (the
-“Software”), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be included
-in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS
-OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output.rst b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output.rst
deleted file mode 100644
index 377534d7013a56..00000000000000
--- a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output.rst
+++ /dev/null
@@ -1,1495 +0,0 @@
-Kosmos-2: Multimodal Large Language Model and OpenVINO
-======================================================
-
-`KOSMOS-2 `__
-is a multimodal large language model (MLLM) that introduces new
-capabilities for multimodal grounding and referring. KOSMOS-2 can
-understand multimodal input, follow instructions, perceive object
-descriptions (e.g., bounding boxes), and ground language to the visual
-world.
-
-Multimodal Large Language Models (MLLMs) have successfully served as a
-general-purpose interface across a wide range of tasks, such as
-language, vision, and vision-language tasks. MLLMs can perceive general
-modalities, including text, images, and audio, and generate free-form
-text responses under zero-shot and few-shot settings.
-
-`In this work `__, the authors unlock the
-grounding capability for multimodal large language models. Grounding
-capability provides a more convenient and efficient human-AI
-interaction for vision-language tasks. It enables the user to point to
-an object or region in the image directly, rather than typing a
-detailed text description to refer to it, and the model can then
-understand that image region together with its spatial location.
-Grounding capability also enables the model to respond with visual
-answers (i.e., bounding boxes), which can support more vision-language
-tasks such as referring expression comprehension. Compared with
-text-only responses, visual answers are more accurate and resolve
-coreference ambiguity. In addition, grounding capability can link noun
-phrases and referring expressions in the generated free-form text
-response to the image regions, providing more accurate, informative,
-and comprehensive answers.
-
-..
figure:: https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg - :alt: image - - image - - -**Table of contents:** - - -- `Install requirements <#install-requirements>`__ -- `Original model inference <#original-model-inference>`__ -- `Convert models to OpenVINO Intermediate representation (IR) - format <#convert-models-to-openvino-intermediate-representation-ir-format>`__ - - - `Convert the vision model <#convert-the-vision-model>`__ - - `Convert Image To Text Projection - model <#convert-image-to-text-projection-model>`__ - - `Convert Text model <#convert-text-model>`__ - -- `Compiling models and prepare - pipeline <#compiling-models-and-prepare-pipeline>`__ -- `Inference <#inference>`__ -- `Quantization <#quantization>`__ - - - `Prepare calibration datasets <#prepare-calibration-datasets>`__ - - `Run quantization <#run-quantization>`__ - - `Run Weights Compression <#run-weights-compression>`__ - - `Compare model file sizes <#compare-model-file-sizes>`__ - - `Compare inference time of the FP32 and optimized - pipelines <#compare-inference-time-of-the-fp32-and-optimized-pipelines>`__ - -- `Interactive inference <#interactive-inference>`__ - - - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Install requirements --------------------- - - - -.. code:: ipython3 - - %pip install --upgrade pip - %pip install -q "openvino>=2024.0.0" "nncf>=2.11.0" "datasets>=2.20.0" - %pip install -q "transformers>=4.35" Pillow "gradio>=4.19" opencv-python "matplotlib>=3.4" - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu torch torchvision - - -.. parsed-literal:: - - Requirement already satisfied: pip in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (25.0) - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - -Original model inference ------------------------- - - - -Let’s take the `original -example `__ - -.. 
code:: ipython3 - - import requests - from pathlib import Path - - from PIL import Image - from transformers import AutoProcessor, AutoModelForVision2Seq - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("kosmos2-multimodal-large-language-model.ipynb") - - model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224") - processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224") - - prompt = "An image of" # is used to prompt the model to generate locations tokens - - input_image_path = Path("snowman.png") - if not input_image_path.exists(): - url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.png" - image = Image.open(requests.get(url, stream=True).raw) - image.save(input_image_path) - - # The original Kosmos-2 demo saves the image first then reload it. For some images, this will give slightly different image input and change the generation outputs. - image = Image.open(input_image_path) - - inputs = processor(text=prompt, images=image, return_tensors="pt") - - generated_ids = model.generate( - pixel_values=inputs["pixel_values"], - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"], - image_embeds=None, - image_embeds_position_mask=inputs["image_embeds_position_mask"], - use_cache=True, - max_new_tokens=128, - ) - - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - - # Specify `cleanup_and_extract=False` in order to see the raw model generation. - processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False) - print(f"Raw model generation: {processed_text}") - # ` An image of a snowman warming himself by a fire.` - - # By default, the generated text is cleanup and the entities are extracted. - processed_text, entities = processor.post_process_generation(generated_text) - - print(f"Cleaned up generated text: {processed_text=}") - # `An image of a snowman warming himself by a fire.` - - print(f"Extracted entities: {entities}") - # `[('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]` - - -.. parsed-literal:: - - 2025-02-04 02:47:22.531266: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2025-02-04 02:47:22.565281: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2025-02-04 02:47:23.225923: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - -.. parsed-literal:: - - Raw model generation: An image of a snowman warming himself by a campfire. - Cleaned up generated text: processed_text='An image of a snowman warming himself by a campfire.' 
- Extracted entities: [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a campfire', (41, 51), [(0.109375, 0.640625, 0.546875, 0.984375)])] - - -Once you have the entities, you can use the following helper function to -draw their bounding bboxes on the image: - -.. code:: ipython3 - - import cv2 - import numpy as np - - from PIL import Image - - - def is_overlapping(rect1, rect2): - x1, y1, x2, y2 = rect1 - x3, y3, x4, y4 = rect2 - return not (x2 < x3 or x1 > x4 or y2 < y3 or y1 > y4) - - - def draw_entity_boxes_on_image(image, entities): - """_summary_ - Args: - image (_type_): image or image path - collect_entity_location (_type_): _description_ - """ - if isinstance(image, Image.Image): - image_h = image.height - image_w = image.width - image = np.array(image)[:, :, [2, 1, 0]] - else: - raise ValueError(f"invaild image format, {type(image)} for {image}") - - if len(entities) == 0: - return image - - new_image = image.copy() - previous_bboxes = [] - # size of text - text_size = 1 - # thickness of text - text_line = 1 # int(max(1 * min(image_h, image_w) / 512, 1)) - box_line = 3 - (c_width, text_height), _ = cv2.getTextSize("F", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line) - base_height = int(text_height * 0.675) - text_offset_original = text_height - base_height - text_spaces = 3 - - for entity_name, (start, end), bboxes in entities: - for x1_norm, y1_norm, x2_norm, y2_norm in bboxes: - orig_x1, orig_y1, orig_x2, orig_y2 = ( - int(x1_norm * image_w), - int(y1_norm * image_h), - int(x2_norm * image_w), - int(y2_norm * image_h), - ) - # draw bbox - # random color - color = tuple(np.random.randint(0, 255, size=3).tolist()) - new_image = cv2.rectangle(new_image, (orig_x1, orig_y1), (orig_x2, orig_y2), color, box_line) - - l_o, r_o = box_line // 2 + box_line % 2, box_line // 2 + box_line % 2 + 1 - - x1 = orig_x1 - l_o - y1 = orig_y1 - l_o - - if y1 < text_height + text_offset_original + 2 * text_spaces: - y1 = orig_y1 + r_o + text_height + text_offset_original + 2 * text_spaces - x1 = orig_x1 + r_o - - # add text background - (text_width, text_height), _ = cv2.getTextSize(f" {entity_name}", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line) - text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2 = ( - x1, - y1 - (text_height + text_offset_original + 2 * text_spaces), - x1 + text_width, - y1, - ) - - for prev_bbox in previous_bboxes: - while is_overlapping((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2), prev_bbox): - text_bg_y1 += text_height + text_offset_original + 2 * text_spaces - text_bg_y2 += text_height + text_offset_original + 2 * text_spaces - y1 += text_height + text_offset_original + 2 * text_spaces - - if text_bg_y2 >= image_h: - text_bg_y1 = max( - 0, - image_h - (text_height + text_offset_original + 2 * text_spaces), - ) - text_bg_y2 = image_h - y1 = image_h - break - - alpha = 0.5 - for i in range(text_bg_y1, text_bg_y2): - for j in range(text_bg_x1, text_bg_x2): - if i < image_h and j < image_w: - if j < text_bg_x1 + 1.35 * c_width: - # original color - bg_color = color - else: - # white - bg_color = [255, 255, 255] - new_image[i, j] = (alpha * new_image[i, j] + (1 - alpha) * np.array(bg_color)).astype(np.uint8) - - cv2.putText( - new_image, - f" {entity_name}", - (x1, y1 - text_offset_original - 1 * text_spaces), - cv2.FONT_HERSHEY_COMPLEX, - text_size, - (0, 0, 0), - text_line, - cv2.LINE_AA, - ) - # previous_locations.append((x1, y1)) - previous_bboxes.append((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2)) - - pil_image = Image.fromarray(new_image[:, :, 
[2, 1, 0]]) - - return pil_image - -.. code:: ipython3 - - # Draw the bounding bboxes - new_image = draw_entity_boxes_on_image(image, entities) - display(new_image) - - - -.. image:: kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.png - - -Convert models to OpenVINO Intermediate representation (IR) format ------------------------------------------------------------------- - - - -The original model includes 3 models: vision model -``Kosmos2VisionModel``, ``Kosmos2ImageToTextProjection`` that is the -layer that transforms the image model’s output to part of the text -model’s input (namely, image features), and transformer based text model -``Kosmos2TextForCausalLM``. We will convert all of them and then replace -the original models. - -Define paths for converted models: - -.. code:: ipython3 - - from pathlib import Path - - - models_base_folder = Path("models") - VISION_MODEL_IR_PATH = models_base_folder / "vision_model.xml" - IMAGE_TO_TEXT_PROJECTION_MODEL_IR_PATH = models_base_folder / "image_to_text_projection_model.xml" - FIRST_STAGE_MODEL_PATH = models_base_folder / "kosmos_input_embed.xml" - SECOND_STAGE_MODEL_PATH = models_base_folder / "kosmos_with_past.xml" - -Define the conversion function for PyTorch modules. We use -``ov.convert_model`` function to obtain OpenVINO Intermediate -Representation object and ``ov.save_model`` function to save it as XML -file. - -.. code:: ipython3 - - import gc - - import torch - - import openvino as ov - - - def cleanup_torchscript_cache(): - # cleanup memory - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - - gc.collect() - - - def convert(model: torch.nn.Module, xml_path: str, example_input): - xml_path = Path(xml_path) - if not xml_path.exists(): - xml_path.parent.mkdir(parents=True, exist_ok=True) - with torch.no_grad(): - converted_model = ov.convert_model(model, example_input=example_input) - ov.save_model(converted_model, xml_path, compress_to_fp16=False) - - cleanup_torchscript_cache() - -Convert the vision model -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Vision model accept ``pixel_values`` and returns ``image_embeds``. - -.. code:: ipython3 - - convert(model.vision_model, VISION_MODEL_IR_PATH, inputs["pixel_values"]) - - -.. parsed-literal:: - - WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. - - -.. parsed-literal:: - - [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead - warnings.warn( - `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:452: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:519: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:559: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): - - -Convert Image To Text Projection model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from torch import nn - - - def get_image_embeds(pixel_values): - vision_model_output = model.vision_model(pixel_values) - image_embeds = model.vision_model.model.post_layernorm(vision_model_output[0]) - image_embeds = nn.functional.normalize(image_embeds, dim=-1) - - return image_embeds - - - image_embeds = get_image_embeds(inputs["pixel_values"]) - convert(model.image_to_text_projection, IMAGE_TO_TEXT_PROJECTION_MODEL_IR_PATH, image_embeds) - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:168: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at aten/src/ATen/core/TensorBody.h:489.) - if a.grad is not None: - - -Convert Text model -~~~~~~~~~~~~~~~~~~ - - - -The Text Model performs in generation pipeline and we can separate it -into two stage. In the first stage the model transforms ``image_embeds`` -into output for the second stage. In the second stage the model produces -tokens during several runs that can be transformed into raw model -generated text by ``AutoProcessor``. - -.. 
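Before running the conversion, it may help to see how the per-layer key/value
cache is flattened into individually named tensors. The sketch below is purely
illustrative (dummy tensors, invented shapes) and is not part of the original
notebook; the conversion code that follows assigns names of exactly this form,
and the wrapper classes defined later rebuild the nested tuples from them.

.. code:: ipython3

    import torch

    # Illustration only: pretend the text model has 2 decoder layers, each
    # producing a (key, value) pair of shape [batch, num_heads, past_len, head_dim].
    dummy_cache = tuple((torch.zeros(1, 16, 4, 64), torch.zeros(1, 16, 4, 64)) for _ in range(2))

    # Flatten the nested tuple and name each tensor the way the second-stage
    # model's inputs are named during conversion.
    flat_names = []
    for idx, (key, value) in enumerate(dummy_cache):
        flat_names.extend([f"past_key_values.{idx}.key", f"past_key_values.{idx}.value"])

    print(flat_names)

..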
code:: ipython3 - - from typing import Optional, List - - from transformers.models.kosmos2.modeling_kosmos2 import ( - create_position_ids_from_input_ids, - ) - - - def get_projecton_image_embeds(pixel_values): - vision_model_output = model.vision_model(pixel_values) - image_embeds = model.vision_model.model.post_layernorm(vision_model_output[0]) - image_embeds = nn.functional.normalize(image_embeds, dim=-1) - image_embeds, _ = model.image_to_text_projection(image_embeds) - - return image_embeds - - - def flattenize_inputs(inputs): - """ - Helper function for making nested inputs flattens - """ - flatten_inputs = [] - for input_data in inputs: - if input_data is None: - continue - if isinstance(input_data, (list, tuple)): - flatten_inputs.extend(flattenize_inputs(input_data)) - else: - flatten_inputs.append(input_data) - return flatten_inputs - - - def postprocess_converted_model( - ov_model, - example_input=None, - input_names=None, - output_names=None, - dynamic_shapes=None, - ): - """ - Helper function for appling postprocessing on converted model with updating input names, shapes and output names - acording to requested specification - """ - - flatten_example_inputs = flattenize_inputs(example_input) if example_input else [] - if input_names: - for inp_name, m_input, input_data in zip(input_names, ov_model.inputs, flatten_example_inputs): - m_input.get_tensor().set_names({inp_name}) - - if output_names: - for out, out_name in zip(ov_model.outputs, output_names): - out.get_tensor().set_names({out_name}) - - return ov_model - - - def convert_text_model(): - model.text_model.model.config.torchscript = True - model.text_model.config.torchscript = True - image_embeds = get_projecton_image_embeds(inputs["pixel_values"]) - conv_inputs = { - "input_ids": inputs["input_ids"], - "attention_mask": inputs["attention_mask"], - "image_embeds": image_embeds, - "image_embeds_position_mask": inputs["image_embeds_position_mask"], - } - outs = model.text_model.model(**conv_inputs) - inputs_ = ["input_ids", "attention_mask"] - outputs = ["logits"] - dynamic_shapes = { - "input_ids": {1: "seq_len"}, - "attention_mask": {1: "seq_len"}, - "position_ids": {0: "seq_len"}, - } - for idx in range(len(outs[1])): - inputs_.extend([f"past_key_values.{idx}.key", f"past_key_values.{idx}.value"]) - dynamic_shapes[inputs_[-1]] = {2: "past_sequence + sequence"} - dynamic_shapes[inputs_[-2]] = {2: "past_sequence + sequence"} - outputs.extend([f"present.{idx}.key", f"present.{idx}.value"]) - - if not FIRST_STAGE_MODEL_PATH.exists(): - ov_model = ov.convert_model(model.text_model.model, example_input=conv_inputs) - ov_model = postprocess_converted_model(ov_model, output_names=outputs) - ov.save_model(ov_model, FIRST_STAGE_MODEL_PATH) - del ov_model - cleanup_torchscript_cache() - - if not SECOND_STAGE_MODEL_PATH.exists(): - position_ids = create_position_ids_from_input_ids( - inputs["input_ids"], - padding_idx=model.text_model.config.pad_token_id, - past_key_values_length=0, - )[:, -1:] - - example_input_second_stage = { - "input_ids": inputs["input_ids"][:, -1:], - "attention_mask": inputs["input_ids"].new_ones(1, inputs["input_ids"].shape[1] + 1), - "position_ids": position_ids, - "past_key_values": outs[1], - } - - ov_model = ov.convert_model(model.text_model.model, example_input=example_input_second_stage) - ov_model = postprocess_converted_model( - ov_model, - example_input=example_input_second_stage.values(), - input_names=inputs_, - output_names=outputs, - dynamic_shapes=dynamic_shapes, - ) - ov.save_model(ov_model, 
SECOND_STAGE_MODEL_PATH) - del ov_model - cleanup_torchscript_cache() - - - convert_text_model() - - -.. parsed-literal:: - - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:859: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if max_pos > self.weights.size(0): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:1168: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if input_shape[-1] > 1: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:975: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if attention_mask.size() != (batch_size, 1, seq_length, src_len): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/875/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:1261: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if past_key_values_length > 0: - - -Compiling models and prepare pipeline -------------------------------------- - - - -Select device that will be used to do models inference using OpenVINO -from the dropdown list: - -.. code:: ipython3 - - from notebook_utils import device_widget - - - core = ov.Core() - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Let’s create callable wrapper classes for compiled models to allow -interaction with original pipeline. Note that all of wrapper classes -return ``torch.Tensor``\ s instead of ``np.array``\ s. - -.. 
code:: ipython3 - - class WraperInternalVisionModel: - post_layernorm = model.vision_model.model.post_layernorm - - - class VisionModelWrapper(torch.nn.Module): - def __init__(self, model_ir_path): - super().__init__() - self.model = WraperInternalVisionModel() - self.vision_model = core.compile_model(model_ir_path, device.value) - - def forward(self, pixel_values, **kwargs): - vision_model_output = self.vision_model(pixel_values)[0] - - return [torch.from_numpy(vision_model_output)] - - - class ImageToTextProjectionModelWrapper(torch.nn.Module): - def __init__(self, model_ir_path): - super().__init__() - self.image_to_text_projection = core.compile_model(model_ir_path, device.value) - - def forward(self, image_embeds): - output = self.image_to_text_projection(image_embeds) - image_embeds = output[0] - projection_attentions = output[1] - return image_embeds, projection_attentions - -.. code:: ipython3 - - from transformers.generation import GenerationConfig, GenerationMixin - from transformers.models.kosmos2.modeling_kosmos2 import ( - Kosmos2ForConditionalGenerationModelOutput, - ) - - - class KosmosForCausalLMWrapper(GenerationMixin): - def __init__(self, first_stage_model_path, second_stage_model_path, device): - self.model_stage_1 = core.compile_model(first_stage_model_path, device.value) - self.model_stage_2 = core.read_model(second_stage_model_path) - self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model_stage_2.inputs)} - self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model_stage_2.outputs)} - self.key_value_input_names = [key for key in self.input_names if "key_values" in key] - self.key_value_output_names = [key for key in self.output_names if "present" in key] - self.model_stage_2 = core.compile_model(self.model_stage_2, device.value) - - self.request = self.model_stage_2.create_infer_request() - self.config = model.config - self.generation_config = GenerationConfig.from_model_config(model.config) - self.main_input_name = "input_ids" - self.device = torch.device("cpu") - self.num_pkv = 2 - self.lm_head = nn.Linear( - in_features=model.text_model.config.embed_dim, - out_features=model.text_model.config.vocab_size, - bias=False, - ) - self._supports_cache_class = False - - def get_input_embeddings(self) -> nn.Module: - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self) -> nn.Module: - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def can_generate(self): - """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate.""" - return True - - def __call__( - self, - input_ids, - attention_mask: Optional[torch.Tensor] = None, - image_embeds: Optional[torch.Tensor] = None, - image_embeds_position_mask: Optional[torch.Tensor] = None, - position_ids=None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - **kwargs, - ): - return self.forward( - input_ids, - attention_mask, - image_embeds, - image_embeds_position_mask, - position_ids, - past_key_values, - ) - - def forward( - self, - input_ids, - attention_mask: Optional[torch.Tensor] = None, - image_embeds: Optional[torch.Tensor] = None, - image_embeds_position_mask: Optional[torch.Tensor] = None, - position_ids=None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - **kwargs, - ): - if past_key_values is None: - outs = self.model_stage_1( - { - "input_ids": 
input_ids, - "attention_mask": attention_mask, - "image_embeds": image_embeds, - "image_embeds_position_mask": image_embeds_position_mask, - } - ) - lm_logits = model.text_model.lm_head(torch.from_numpy(outs[0])) - - pkv = list(outs.values())[1:] - pkv = tuple(pkv[i : i + 2] for i in range(0, len(pkv), 2)) - - return Kosmos2ForConditionalGenerationModelOutput(logits=lm_logits, past_key_values=pkv) - - if past_key_values is not None: - past_key_values = tuple(past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer) - inputs_ = { - "input_ids": input_ids[:, -1].unsqueeze(-1), - "attention_mask": attention_mask, - "position_ids": position_ids, - } - inputs_.update(dict(zip(self.key_value_input_names, past_key_values))) - - # Run inference - self.request.start_async(inputs_, share_inputs=True) - self.request.wait() - - logits = torch.from_numpy(self.request.get_tensor("logits").data) - logits = model.text_model.lm_head(logits) - - # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer) - past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) - # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) - - past_key_values = tuple(past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv)) - - return Kosmos2ForConditionalGenerationModelOutput(logits=logits, past_key_values=past_key_values) - - def prepare_inputs_for_generation( - self, - input_ids, - image_embeds=None, - image_embeds_position_mask=None, - past_key_values=None, - attention_mask=None, - use_cache=None, - **kwargs, - ): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - position_ids = None - - # cut input_ids if past_key_values is used - if past_key_values is not None: - position_ids = create_position_ids_from_input_ids( - input_ids, - padding_idx=model.text_model.config.pad_token_id, - past_key_values_length=0, - )[:, -1:] - - input_ids = input_ids[:, -1:] - image_embeds = None - image_embeds_position_mask = None - elif image_embeds_position_mask is not None: - batch_size, seq_len = input_ids.size() - mask_len = image_embeds_position_mask.size()[-1] - image_embeds_position_mask = torch.cat( - ( - image_embeds_position_mask, - torch.zeros( - size=(batch_size, seq_len - mask_len), - dtype=torch.bool, - device=input_ids.device, - ), - ), - dim=1, - ) - - return { - "input_ids": input_ids, - "image_embeds": image_embeds, - "image_embeds_position_mask": image_embeds_position_mask, - "position_ids": position_ids, - "past_key_values": past_key_values, - "attention_mask": attention_mask, - } - - @staticmethod - # Copied from transformers.models.umt5.modeling_umt5.UMT5ForConditionalGeneration._reorder_cache - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += (tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),) - return reordered_past - - - class Kosmos2ForConditionalGenerationWrapper: - def __init__( - self, - vision_model_path, - image_to_text_projection_model_path, - first_stage_model_path, - second_stage_model_path, - device, - ): - self.vision_model = VisionModelWrapper(vision_model_path) - self.image_to_text_projection = 
ImageToTextProjectionModelWrapper(image_to_text_projection_model_path) - self.text_model = KosmosForCausalLMWrapper(first_stage_model_path, second_stage_model_path, device) - - def generate( - self, - pixel_values=None, - image_embeds_position_mask=None, - input_ids=None, - attention_mask=None, - image_embeds=None, - **kwargs, - ): - vision_model_output = self.vision_model(pixel_values) - image_embeds = model.vision_model.model.post_layernorm(vision_model_output[0]) - # normalized features - image_embeds = nn.functional.normalize(image_embeds, dim=-1) - image_embeds, projection_attentions = self.image_to_text_projection(image_embeds.detach().numpy()) - - output = self.text_model.generate( - input_ids, - attention_mask=attention_mask, - image_embeds=image_embeds, - image_embeds_position_mask=image_embeds_position_mask, - **kwargs, - ) - - return output - -.. code:: ipython3 - - ov_model = Kosmos2ForConditionalGenerationWrapper( - VISION_MODEL_IR_PATH, - IMAGE_TO_TEXT_PROJECTION_MODEL_IR_PATH, - FIRST_STAGE_MODEL_PATH, - SECOND_STAGE_MODEL_PATH, - device, - ) - -Inference ---------- - - - -.. code:: ipython3 - - def generate_entities(model): - generated_ids = model.generate( - pixel_values=inputs["pixel_values"], - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"], - image_embeds=None, - image_embeds_position_mask=inputs["image_embeds_position_mask"], - max_new_tokens=128, - ) - - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - - # Specify `cleanup_and_extract=False` in order to see the raw model generation. - processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False) - print(f"Raw model generation: {processed_text}") - # ` An image of a snowman warming himself by a fire.` - - # By default, the generated text is cleanup and the entities are extracted. - processed_text, entities = processor.post_process_generation(generated_text) - - print(f"Cleaned up generated text: {processed_text=}") - # `An image of a snowman warming himself by a fire.` - - print(f"Extracted entities: {entities}") - # `[('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]` - return entities - -.. code:: ipython3 - - entities = generate_entities(ov_model) - new_image = draw_entity_boxes_on_image(image, entities) - display(new_image) - - -.. parsed-literal:: - - Raw model generation: An image of a snowman warming himself by a campfire. - Cleaned up generated text: processed_text='An image of a snowman warming himself by a campfire.' - Extracted entities: [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a campfire', (41, 51), [(0.109375, 0.640625, 0.546875, 0.984375)])] - - - -.. image:: kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.png - - -Quantization ------------- - - - -`NNCF `__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -Please select below whether you would like to run quantization to -improve model inference speed. - - **NOTE**: Quantization is time and memory consuming operation. - Running quantization code below may take some time. - -.. 
code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget() - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - import requests - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - ov_optimized_model = None - - %load_ext skip_kernel_extension - -Prepare calibration datasets -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use a portion of -`KoalaAI/StockImages-CC0 `__ -dataset from Hugging Face as calibration data. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - INT8_VISION_MODEL_IR_PATH = models_base_folder / "vision_model_int8.xml" - INT8_IMAGE_TO_TEXT_PROJECTION_MODEL_IR_PATH = models_base_folder / "image_to_text_projection_model_int8.xml" - INT4_FIRST_STAGE_MODEL_PATH = models_base_folder / "kosmos_input_embed_int4.xml" - INT4_SECOND_STAGE_MODEL_PATH = models_base_folder / "kosmos_with_past_int4.xml" - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import datasets - - prompt = "An image of" - subset_size = 200 - - dataset = datasets.load_dataset("KoalaAI/StockImages-CC0", split="train", streaming=True, trust_remote_code=True) - dataset = dataset.shuffle(seed=42).take(subset_size).select_columns(["image"]) - -To collect intermediate model inputs for calibration we should customize -``CompiledModel`` and ``InferRequest``. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from tqdm.notebook import tqdm - from transformers import set_seed - - set_seed(42) - - def collect_calibration_data(pipeline, dataset, subset_size): - calibration_dataset = { - "vision_model": [], - "image_to_text_proj": [], - } - for data in tqdm(dataset, total=subset_size, desc="Collecting calibration dataset"): - img = data["image"] - pixel_values = processor(text=prompt, images=img, return_tensors="pt")["pixel_values"] - vision_model_output = pipeline.vision_model(pixel_values) - image_embeds = model.vision_model.model.post_layernorm(vision_model_output[0]) - - image_embeds = nn.functional.normalize(image_embeds, dim=-1) - calibration_dataset["vision_model"].append(pixel_values) - calibration_dataset["image_to_text_proj"].append(image_embeds.detach().numpy()) - return calibration_dataset - -.. code:: ipython3 - - %%skip not $to_quantize.value - - if not (INT8_VISION_MODEL_IR_PATH.exists() and INT8_IMAGE_TO_TEXT_PROJECTION_MODEL_IR_PATH.exists()): - calibration_dataset = collect_calibration_data(ov_model, dataset, subset_size) - - - -.. parsed-literal:: - - Collecting calibration dataset: 0%| | 0/200 [00:00 An image of a snowman warming himself by a fire in the winter - Cleaned up generated text: processed_text='An image of a snowman warming himself by a fire in the winter' - Extracted entities: [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.203125, 0.046875, 0.453125, 0.859375)])] - - - -.. image:: kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_48_1.png - - -Compare model file sizes -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - fp32_model_paths = [VISION_MODEL_IR_PATH, IMAGE_TO_TEXT_PROJECTION_MODEL_IR_PATH, FIRST_STAGE_MODEL_PATH, SECOND_STAGE_MODEL_PATH] - int8_model_paths = [INT8_VISION_MODEL_IR_PATH, INT8_IMAGE_TO_TEXT_PROJECTION_MODEL_IR_PATH, INT4_FIRST_STAGE_MODEL_PATH, INT4_SECOND_STAGE_MODEL_PATH] - - for fp32_path, int8_path in zip(fp32_model_paths, int8_model_paths): - fp32_ir_model_size = fp32_path.with_suffix(".bin").stat().st_size - int8_model_size = int8_path.with_suffix(".bin").stat().st_size - print(f"{fp32_path.stem} compression rate: {fp32_ir_model_size / int8_model_size:.3f}") - - -.. parsed-literal:: - - vision_model compression rate: 3.956 - image_to_text_projection_model compression rate: 3.899 - kosmos_input_embed compression rate: 3.475 - kosmos_with_past compression rate: 3.475 - - -Compare inference time of the FP32 and optimized pipelines -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To measure the inference performance of the ``FP32`` and optimized -pipelines, we use mean inference time on 7 samples. - - **NOTE**: For the most accurate performance estimation, it is - recommended to run ``benchmark_app`` in a terminal/command prompt - after closing other applications. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import time - - def calculate_inference_time(pipeline, dataset): - inference_time = [] - for data in dataset.take(7): - img = data["image"] - inputs = processor(text=prompt, images=img, return_tensors="pt") - - start = time.perf_counter() - _ = pipeline.generate( - pixel_values=inputs["pixel_values"], - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"], - image_embeds=None, - image_embeds_position_mask=inputs["image_embeds_position_mask"], - max_new_tokens=128, - ) - - end = time.perf_counter() - delta = end - start - inference_time.append(delta) - return np.mean(inference_time) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - fp_latency = calculate_inference_time(ov_model, dataset) - print(f"FP32 pipeline: {fp_latency:.3f} seconds") - int_latency = calculate_inference_time(ov_optimized_model, dataset) - print(f"Optimized pipeline: {int_latency:.3f} seconds") - print(f"Performance speed-up: {fp_latency / int_latency:.3f}") - - -.. parsed-literal:: - - FP32 pipeline: 2.716 seconds - Optimized pipeline: 1.177 seconds - Performance speed-up: 2.308 - - -Interactive inference ---------------------- - - - -Please select below whether you would like to use the quantized models -to launch the interactive demo. - -.. code:: ipython3 - - import ipywidgets as widgets - - quantized_models_present = ov_optimized_model is not None - - use_quantized_models = widgets.Checkbox( - value=quantized_models_present, - description="Use quantized models", - disabled=not quantized_models_present, - ) - - use_quantized_models - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Use quantized models') - - - -.. 
code:: ipython3 - - import gradio as gr - - pipeline = ov_optimized_model if use_quantized_models.value else ov_model - - - def generate(image, prompt, use_bbox, _=gr.Progress(track_tqdm=True)): - if use_bbox: - prompt = " " + prompt - inputs = processor(text=prompt, images=image, return_tensors="pt") - generated_ids_ = pipeline.generate( - pixel_values=inputs["pixel_values"], - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"], - image_embeds=None, - image_embeds_position_mask=inputs["image_embeds_position_mask"], - max_new_tokens=128, - ) - generated_text = processor.batch_decode(generated_ids_, skip_special_tokens=True)[0] - processed_text, entities = processor.post_process_generation(generated_text) - - new_image = draw_entity_boxes_on_image(Image.fromarray(image), entities) - - return new_image, processed_text - - - if not Path("gradio_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/kosmos2-multimodal-large-language-model/gradio_helper.py" - ) - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=generate) - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(debug=False, share=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - - -.. parsed-literal:: - - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. - - - - - - - diff --git a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.jpg b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.jpg deleted file mode 100644 index 54c6b2ade471a4..00000000000000 --- a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9fe356572bcfd5606b34764bc1b27d76da4d183ed2a97adb871954c6ddbdfa06 -size 116596 diff --git a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.png b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.png deleted file mode 100644 index 6fc99a5e729785..00000000000000 --- a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e6bab9d0190b5977eebc25da3da2bf3730534922086c334ad55664a40a8c36fd -size 1495569 diff --git a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_48_1.png b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_48_1.png deleted file mode 100644 index 6f3b58d2f19378..00000000000000 --- a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_48_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:0e3657b663911aa0bc44576547d6225885defbb3b35c524f3fabed0b7420b9f4 -size 1432136 diff --git a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.jpg b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.jpg deleted file mode 100644 index 3219a7648ab49c..00000000000000 --- a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9aade49643152f532addddbbfdb4a54aec3666e871305f37c6e96ee0efd536e7 -size 119554 diff --git a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.png b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.png deleted file mode 100644 index 2040000f914c10..00000000000000 --- a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1b096107d8bf14416f3305efde5364dc9ad3e3473b2cad8a6c1e80d9ba5de072 -size 1495968 diff --git a/docs/notebooks/language-quantize-bert-with-output.rst b/docs/notebooks/language-quantize-bert-with-output.rst deleted file mode 100644 index c01a3420fa9c39..00000000000000 --- a/docs/notebooks/language-quantize-bert-with-output.rst +++ /dev/null @@ -1,712 +0,0 @@ -Quantize NLP models with Post-Training Quantization ​in NNCF -============================================================ - -This tutorial demonstrates how to apply ``INT8`` quantization to the -Natural Language Processing model known as -`BERT `__, using -the `Post-Training Quantization -API `__ -(NNCF library). A fine-tuned `HuggingFace -BERT `__ -`PyTorch `__ model, trained on the `Microsoft -Research Paraphrase Corpus -(MRPC) `__, -will be used. The tutorial is designed to be extendable to custom models -and datasets. It consists of the following steps: - -- Download and prepare the BERT model and MRPC dataset. -- Define data loading and accuracy validation functionality. -- Prepare the model for quantization. -- Run optimization pipeline. -- Load and test quantized model. -- Compare the performance of the original, converted and quantized - models. - - -**Table of contents:** - - -- `Imports <#imports>`__ -- `Settings <#settings>`__ -- `Prepare the Model <#prepare-the-model>`__ -- `Prepare the Dataset <#prepare-the-dataset>`__ -- `Optimize model using NNCF Post-training Quantization - API <#optimize-model-using-nncf-post-training-quantization-api>`__ -- `Load and Test OpenVINO Model <#load-and-test-openvino-model>`__ - - - `Select inference device <#select-inference-device>`__ - -- `Compare F1-score of FP32 and INT8 - models <#compare-f1-score-of-fp32-and-int8-models>`__ -- `Compare Performance of the Original, Converted and Quantized - Models <#compare-performance-of-the-original-converted-and-quantized-models>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. 
code:: ipython3 - - %pip install -q "nncf>=2.5.0" - %pip install -q torch transformers "torch>=2.1" datasets evaluate tqdm --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "openvino>=2023.1.0" - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - -Imports -------- - - - -.. code:: ipython3 - - import time - from pathlib import Path - from zipfile import ZipFile - from typing import Iterable - from typing import Any - - import datasets - import evaluate - import numpy as np - import nncf - from nncf.parameters import ModelType - import openvino as ov - import torch - from transformers import BertForSequenceClassification, BertTokenizer - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("language-quantize-bert.ipynb") - - -.. parsed-literal:: - - 2025-02-04 02:54:07.064767: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2025-02-04 02:54:07.099737: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2025-02-04 02:54:07.654286: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -Settings --------- - - - -.. code:: ipython3 - - # Set the data and model directories, source URL and the filename of the model. - DATA_DIR = Path("data") - MODEL_DIR = Path("model") - MODEL_LINK = "https://download.pytorch.org/tutorial/MRPC.zip" - FILE_NAME = MODEL_LINK.split("/")[-1] - PRETRAINED_MODEL_DIR = MODEL_DIR / "MRPC" - - DATA_DIR.mkdir(exist_ok=True) - MODEL_DIR.mkdir(exist_ok=True) - -Prepare the Model ------------------ - - - -Perform the following: - -- Download and unpack pre-trained BERT model for MRPC by PyTorch. -- Convert the model to the OpenVINO Intermediate Representation - (OpenVINO IR) - -.. code:: ipython3 - - if not PRETRAINED_MODEL_DIR.exists(): - download_file(MODEL_LINK, directory=MODEL_DIR, show_progress=True) - with ZipFile(f"{MODEL_DIR}/{FILE_NAME}", "r") as zip_ref: - zip_ref.extractall(MODEL_DIR) - - - -.. parsed-literal:: - - MRPC.zip: 0%| | 0.00/387M [00:00`__ dataset for the MRPC task from -HuggingFace datasets. Then, we tokenize the data with a pre-trained BERT -tokenizer from HuggingFace. - -.. 
code:: ipython3 - - def create_data_source(): - raw_dataset = datasets.load_dataset("glue", "mrpc", split="validation") - tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_DIR) - - def _preprocess_fn(examples): - texts = (examples["sentence1"], examples["sentence2"]) - result = tokenizer(*texts, padding="max_length", max_length=MAX_SEQ_LENGTH, truncation=True) - result["labels"] = examples["label"] - return result - - processed_dataset = raw_dataset.map(_preprocess_fn, batched=True, batch_size=1) - - return processed_dataset - - - data_source = create_data_source() - -Optimize model using NNCF Post-training Quantization API --------------------------------------------------------- - - - -`NNCF `__ provides a suite of -advanced algorithms for Neural Networks inference optimization in -OpenVINO with minimal accuracy drop. We will use 8-bit quantization in -post-training mode (without the fine-tuning pipeline) to optimize BERT. - -The optimization process contains the following steps: - -1. Create a Dataset for quantization -2. Run ``nncf.quantize`` for getting an optimized model -3. Serialize OpenVINO IR model using ``openvino.save_model`` function - -.. code:: ipython3 - - INPUT_NAMES = [key for key in inputs.keys()] - - - def transform_fn(data_item): - """ - Extract the model's input from the data item. - The data item here is the data item that is returned from the data source per iteration. - This function should be passed when the data item cannot be used as model's input. - """ - inputs = {name: np.asarray([data_item[name]], dtype=np.int64) for name in INPUT_NAMES} - return inputs - - - calibration_dataset = nncf.Dataset(data_source, transform_fn) - # Quantize the model. By specifying model_type, we specify additional transformer patterns in the model. - quantized_model = nncf.quantize(model, calibration_dataset, model_type=ModelType.TRANSFORMER) - - - -.. parsed-literal:: - - Output() - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - -.. code:: ipython3 - - compressed_model_xml = Path(MODEL_DIR) / "quantized_bert_mrpc.xml" - ov.save_model(quantized_model, compressed_model_xml) - -Load and Test OpenVINO Model ----------------------------- - - - -To load and test converted model, perform the following: - -- Load the model and compile it for selected device. -- Prepare the input. -- Run the inference. -- Get the answer from the model output. - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - # Compile the model for a specific device. - compiled_quantized_model = core.compile_model(model=quantized_model, device_name=device.value) - output_layer = compiled_quantized_model.outputs[0] - -The Data Source returns a pair of sentences (indicated by -``sample_idx``) and the inference compares these sentences and outputs -whether their meaning is the same. You can test other sentences by -changing ``sample_idx`` to another value (from 0 to 407). - -.. 
code:: ipython3 - - sample_idx = 5 - sample = data_source[sample_idx] - inputs = {k: torch.unsqueeze(torch.tensor(sample[k]), 0) for k in ["input_ids", "token_type_ids", "attention_mask"]} - - result = compiled_quantized_model(inputs)[output_layer] - result = np.argmax(result) - - print(f"Text 1: {sample['sentence1']}") - print(f"Text 2: {sample['sentence2']}") - print(f"The same meaning: {'yes' if result == 1 else 'no'}") - - -.. parsed-literal:: - - Text 1: Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . - Text 2: It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status . - The same meaning: yes - - -Compare F1-score of FP32 and INT8 models ----------------------------------------- - - - -.. code:: ipython3 - - def validate(model: ov.Model, dataset: Iterable[Any]) -> float: - """ - Evaluate the model on GLUE dataset. - Returns F1 score metric. - """ - compiled_model = core.compile_model(model, device_name=device.value) - output_layer = compiled_model.output(0) - - metric = evaluate.load("glue", "mrpc") - for batch in dataset: - inputs = [np.expand_dims(np.asarray(batch[key], dtype=np.int64), 0) for key in INPUT_NAMES] - outputs = compiled_model(inputs)[output_layer] - predictions = outputs[0].argmax(axis=-1) - metric.add_batch(predictions=[predictions], references=[batch["labels"]]) - metrics = metric.compute() - f1_score = metrics["f1"] - - return f1_score - - - print("Checking the accuracy of the original model:") - metric = validate(model, data_source) - print(f"F1 score: {metric:.4f}") - - print("Checking the accuracy of the quantized model:") - metric = validate(quantized_model, data_source) - print(f"F1 score: {metric:.4f}") - - -.. parsed-literal:: - - Checking the accuracy of the original model: - F1 score: 0.9019 - Checking the accuracy of the quantized model: - F1 score: 0.8985 - - -Compare Performance of the Original, Converted and Quantized Models -------------------------------------------------------------------- - - - -Compare the original PyTorch model with OpenVINO converted and quantized -models (``FP32``, ``INT8``) to see the difference in performance. It is -expressed in Sentences Per Second (SPS) measure, which is the same as -Frames Per Second (FPS) for images. - -.. code:: ipython3 - - # Compile the model for a specific device. - compiled_model = core.compile_model(model=model, device_name=device.value) - -.. 
code:: ipython3 - - num_samples = 50 - sample = data_source[0] - inputs = {k: torch.unsqueeze(torch.tensor(sample[k]), 0) for k in ["input_ids", "token_type_ids", "attention_mask"]} - - with torch.no_grad(): - start = time.perf_counter() - for _ in range(num_samples): - torch_model(torch.vstack(list(inputs.values()))) - end = time.perf_counter() - time_torch = end - start - print(f"PyTorch model on CPU: {time_torch / num_samples:.3f} seconds per sentence, " f"SPS: {num_samples / time_torch:.2f}") - - start = time.perf_counter() - for _ in range(num_samples): - compiled_model(inputs) - end = time.perf_counter() - time_ir = end - start - print(f"IR FP32 model in OpenVINO Runtime/{device.value}: {time_ir / num_samples:.3f} " f"seconds per sentence, SPS: {num_samples / time_ir:.2f}") - - start = time.perf_counter() - for _ in range(num_samples): - compiled_quantized_model(inputs) - end = time.perf_counter() - time_ir = end - start - print(f"OpenVINO IR INT8 model in OpenVINO Runtime/{device.value}: {time_ir / num_samples:.3f} " f"seconds per sentence, SPS: {num_samples / time_ir:.2f}") - - -.. parsed-literal:: - - We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked. - - -.. parsed-literal:: - - PyTorch model on CPU: 0.068 seconds per sentence, SPS: 14.69 - IR FP32 model in OpenVINO Runtime/AUTO: 0.020 seconds per sentence, SPS: 49.27 - OpenVINO IR INT8 model in OpenVINO Runtime/AUTO: 0.009 seconds per sentence, SPS: 107.75 - - -Finally, measure the inference performance of OpenVINO ``FP32`` and -``INT8`` models. For this purpose, use `Benchmark -Tool `__ -in OpenVINO. - - **Note**: The ``benchmark_app`` tool is able to measure the - performance of the OpenVINO Intermediate Representation (OpenVINO IR) - models only. For more accurate performance, run ``benchmark_app`` in - a terminal/command prompt after closing other applications. Run - ``benchmark_app -m model.xml -d CPU`` to benchmark async inference on - CPU for one minute. Change ``CPU`` to ``GPU`` to benchmark on GPU. - Run ``benchmark_app --help`` to see an overview of all command-line - options. - -.. code:: ipython3 - - # Inference FP32 model (OpenVINO IR) - !benchmark_app -m $ir_model_xml -shape [1,128],[1,128],[1,128] -d {device.value} -api sync - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ WARNING ] Default duration 120 seconds is used for unknown device AUTO - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.LATENCY. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 18.95 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] input_ids (node: input_ids) : i64 / [...] / [1,?] - [ INFO ] attention_mask , 63 (node: attention_mask) : i64 / [...] / [1,?] - [ INFO ] token_type_ids (node: token_type_ids) : i64 / [...] / [1,?] 
- [ INFO ] Model outputs: - [ INFO ] logits (node: __module.classifier/aten::linear/Add) : f32 / [...] / [1,2] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'input_ids': [1,128], '63': [1,128], 'token_type_ids': [1,128] - [ INFO ] Reshape model took 5.61 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] input_ids (node: input_ids) : i64 / [...] / [1,128] - [ INFO ] attention_mask , 63 (node: attention_mask) : i64 / [...] / [1,128] - [ INFO ] token_type_ids (node: token_type_ids) : i64 / [...] / [1,128] - [ INFO ] Model outputs: - [ INFO ] logits (node: __module.classifier/aten::linear/Add) : f32 / [...] / [1,2] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 362.20 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.LATENCY - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: False - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 12 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 1 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 - [ INFO ] PERFORMANCE_HINT: LATENCY - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'input_ids'!. This input will be filled with random values! - [ WARNING ] No input files were given for input '63'!. This input will be filled with random values! - [ WARNING ] No input files were given for input 'token_type_ids'!. This input will be filled with random values! - [ INFO ] Fill input 'input_ids' with random values - [ INFO ] Fill input '63' with random values - [ INFO ] Fill input 'token_type_ids' with random values - [Step 10/11] Measuring performance (Start inference synchronously, limits: 120000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 23.26 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 6165 iterations - [ INFO ] Duration: 120011.84 ms - [ INFO ] Latency: - [ INFO ] Median: 19.09 ms - [ INFO ] Average: 19.37 ms - [ INFO ] Min: 18.38 ms - [ INFO ] Max: 23.76 ms - [ INFO ] Throughput: 51.37 FPS - - -.. code:: ipython3 - - # Inference INT8 model (OpenVINO IR) - ! benchmark_app -m $compressed_model_xml -shape [1,128],[1,128],[1,128] -d {device.value} -api sync - - -.. 
parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ WARNING ] Default duration 120 seconds is used for unknown device AUTO - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.LATENCY. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 24.96 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] input_ids (node: input_ids) : i64 / [...] / [1,?] - [ INFO ] attention_mask , 63 (node: attention_mask) : i64 / [...] / [1,?] - [ INFO ] token_type_ids (node: token_type_ids) : i64 / [...] / [1,?] - [ INFO ] Model outputs: - [ INFO ] logits (node: __module.classifier/aten::linear/Add) : f32 / [...] / [1,2] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'input_ids': [1,128], '63': [1,128], 'token_type_ids': [1,128] - [ INFO ] Reshape model took 7.31 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] input_ids (node: input_ids) : i64 / [...] / [1,128] - [ INFO ] attention_mask , 63 (node: attention_mask) : i64 / [...] / [1,128] - [ INFO ] token_type_ids (node: token_type_ids) : i64 / [...] / [1,128] - [ INFO ] Model outputs: - [ INFO ] logits (node: __module.classifier/aten::linear/Add) : f32 / [...] / [1,2] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 1043.47 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.LATENCY - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: False - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 12 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 1 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 - [ INFO ] PERFORMANCE_HINT: LATENCY - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'input_ids'!. This input will be filled with random values! - [ WARNING ] No input files were given for input '63'!. This input will be filled with random values! - [ WARNING ] No input files were given for input 'token_type_ids'!. This input will be filled with random values! 
- [ INFO ] Fill input 'input_ids' with random values - [ INFO ] Fill input '63' with random values - [ INFO ] Fill input 'token_type_ids' with random values - [Step 10/11] Measuring performance (Start inference synchronously, limits: 120000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 16.84 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 13546 iterations - [ INFO ] Duration: 120003.26 ms - [ INFO ] Latency: - [ INFO ] Median: 8.70 ms - [ INFO ] Average: 8.77 ms - [ INFO ] Min: 7.64 ms - [ INFO ] Max: 13.31 ms - [ INFO ] Throughput: 112.88 FPS - diff --git a/docs/notebooks/latent-consistency-models-image-generation-with-output.rst b/docs/notebooks/latent-consistency-models-image-generation-with-output.rst deleted file mode 100644 index cc23e35f09d71a..00000000000000 --- a/docs/notebooks/latent-consistency-models-image-generation-with-output.rst +++ /dev/null @@ -1,847 +0,0 @@ -Image generation with Latent Consistency Model and OpenVINO -=========================================================== - -LCMs: The next generation of generative models after Latent Diffusion -Models (LDMs). Latent Diffusion models (LDMs) have achieved remarkable -results in synthesizing high-resolution images. However, the iterative -sampling is computationally intensive and leads to slow generation. - -Inspired by `Consistency Models `__, -`Latent Consistency Models `__ -(LCMs) were proposed, enabling swift inference with minimal steps on any -pre-trained LDMs, including Stable Diffusion. The `Consistency Model -(CM) (Song et al., 2023) `__ is a new -family of generative models that enables one-step or few-step -generation. The core idea of the CM is to learn the function that maps -any points on a trajectory of the PF-ODE (probability flow of `ordinary -differential -equation `__) -to that trajectory’s origin (i.e., the solution of the PF-ODE). By -learning consistency mappings that maintain point consistency on -ODE-trajectory, these models allow for single-step generation, -eliminating the need for computation-intensive iterations. However, CM -is constrained to pixel space image generation tasks, making it -unsuitable for synthesizing high-resolution images. LCMs adopt a -consistency model in the image latent space for generation -high-resolution images. Viewing the guided reverse diffusion process as -solving an augmented probability flow ODE (PF-ODE), LCMs are designed to -directly predict the solution of such ODE in latent space, mitigating -the need for numerous iterations and allowing rapid, high-fidelity -sampling. Utilizing image latent space in large-scale diffusion models -like Stable Diffusion (SD) has effectively enhanced image generation -quality and reduced computational load. The authors of LCMs provide a -simple and efficient one-stage guided consistency distillation method -named Latent Consistency Distillation (LCD) to distill SD for few-step -(2∼4) or even 1-step sampling and propose the SKIPPING-STEP technique to -further accelerate the convergence. More details about proposed approach -and models can be found in `project -page `__, -`paper `__ and `original -repository `__. - -In this tutorial, we consider how to convert and run LCM using OpenVINO. -An additional part demonstrates how to run quantization with -`NNCF `__ to speed up pipeline -and generate images using `OpenVINO -GenAI `__. 
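-
-For reference, the self-consistency idea described above can be written down
-compactly (a brief sketch using the notation of the Consistency Models paper,
-not something introduced by this notebook): a consistency function maps any
-point on a PF-ODE trajectory at time t to the trajectory's origin, and does
-so consistently along the whole trajectory:
-
-.. math::
-
-   f_\theta(\mathbf{x}_t, t) = f_\theta(\mathbf{x}_{t'}, t')
-   \quad \forall \, t, t' \in [\epsilon, T],
-   \qquad
-   f_\theta(\mathbf{x}_\epsilon, \epsilon) = \mathbf{x}_\epsilon
-
-Enforcing this property during latent consistency distillation is what allows
-LCMs to produce a usable latent in just a few steps instead of dozens.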
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Convert models to OpenVINO - format <#convert-models-to-openvino-format>`__ -- `Prepare inference pipeline <#prepare-inference-pipeline>`__ - - - `Configure Inference Pipeline <#configure-inference-pipeline>`__ - -- `Text-to-image generation <#text-to-image-generation>`__ -- `Quantization <#quantization>`__ - - - `Prepare calibration dataset <#prepare-calibration-dataset>`__ - - `Run quantization <#run-quantization>`__ - - `Compare inference time of the FP16 and INT8 - models <#compare-inference-time-of-the-fp16-and-int8-models>`__ - - `Compare UNet file size <#compare-unet-file-size>`__ - -- `Run Text to image generation using OpenVINO - GenAI <#run-text-to-image-generation-using-openvino-genai>`__ -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "torch>=2.1" --index-url https://download.pytorch.org/whl/cpu - %pip install -q "transformers>=4.45" tqdm accelerate "diffusers>=0.30.1" pillow "gradio>=4.19" "nncf>=2.12.0" "datasets>=2.14.6" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" - %pip install -qU --pre "openvino>=2024.4.0" "openvino-tokenizers" "openvino-genai" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - -.. code:: ipython3 - - from pathlib import Path - import requests - - utility_files = [Path("notebook_utils.py"), Path("skip_kernel_extension.py"), Path("cmd_helper.py")] - - base_utils_url = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/" - - for utility_file in utility_files: - if not utility_file.exists(): - r = requests.get(base_utils_url + utility_file.name) - with utility_file.open("w") as f: - f.write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("latent-consistency-models-image-generation.ipynb") - -Convert models to OpenVINO format ---------------------------------- - - - -In this tutorial we will use -`LCM_Dreamshaper_v7 `__ -from `HuggingFace hub `__. This model distilled -from `Dreamshaper v7 `__ -fine-tune of `Stable-Diffusion -v1-5 `__ using -Latent Consistency Distillation (LCD) approach discussed above. This -model is also integrated into -`Diffusers `__ library. -Diffusers is the go-to library for state-of-the-art pretrained diffusion -models for generating images, audio, and even 3D structures of -molecules. This allows us to compare running original Stable Diffusion -(from this -`notebook `__) -and distilled using LCD. The distillation approach efficiently converts -a pre-trained guided diffusion model into a latent consistency model by -solving an augmented PF-ODE. - -For simplifying model export we will utilize Optimum Intel library. -`Optimum Intel `__ is -the interface between the -`Transformers `__ and -`Diffusers `__ libraries -and OpenVINO to accelerate end-to-end pipelines on Intel architectures. -It provides ease-to-use -`interface `__ -for exporting models to `OpenVINO Intermediate Representation -(IR) `__ -format. 
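-
-As a rough sketch of an alternative to the CLI flow shown below (the exact
-keyword arguments may differ between Optimum Intel versions, so treat this as
-an assumption rather than the notebook's method), the same export can also be
-driven from Python through ``OVDiffusionPipeline``:
-
-.. code:: python
-
-   # Hypothetical Python-API export: load the PyTorch pipeline with Optimum Intel,
-   # convert it to OpenVINO IR on the fly, and save the converted pipeline to disk.
-   from optimum.intel.openvino import OVDiffusionPipeline
-
-   ov_pipe = OVDiffusionPipeline.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", export=True)
-   ov_pipe.save_pretrained("LCM_Dreamshaper_v7_ov")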
-
-The command below demonstrates the basic ``optimum-cli`` call for model export:
-
-.. code:: bash
-
-   optimum-cli export openvino --model <model_id> --task <task> <output_dir>
-
-where the ``--model`` argument is a model id from the HuggingFace Hub or a local
-directory with the model (saved using the ``.save_pretrained`` method), and
-``--task`` is one of the `supported
-tasks `__
-that the exported model should solve. For image generation it is
-``text-to-image``. If model initialization requires remote code, the
-``--trust-remote-code`` flag should be passed additionally. You can also
-apply fp16, 8-bit or 4-bit weight compression to the Linear,
-Convolutional and Embedding layers when exporting your model with the
-CLI by setting ``--weight-format`` to fp16, int8 or int4, respectively.
-This type of optimization reduces the memory footprint and inference
-latency. Since we will quantize the model later using NNCF, we use fp16
-as the base export precision in this step.
-
-.. code:: ipython3
-
-    from cmd_helper import optimum_cli
-
-    model_id = "SimianLuo/LCM_Dreamshaper_v7"
-    model_path = Path(model_id.split("/")[-1] + "_ov")
-
-    if not model_path.exists():
-        optimum_cli(model_id, model_path, additional_args={"weight-format": "fp16"})
-
-Prepare inference pipeline
---------------------------
-
-
-
-Putting it all together, let us now take a closer look at how the model
-works in inference by illustrating the logical flow.
-
-.. figure:: https://user-images.githubusercontent.com/29454499/277402235-079bacfb-3b6d-424b-8d47-5ddf601e1639.png
-   :alt: lcm-pipeline
-
-   lcm-pipeline
-
-The pipeline takes a latent image representation and a text prompt as input;
-the prompt is transformed into a text embedding via CLIP’s text encoder. The
-initial latent image representation is generated with a random noise
-generator. Unlike the original Stable Diffusion pipeline, LCM also uses the
-guidance scale to obtain timestep-conditional embeddings as input for the
-diffusion process, whereas Stable Diffusion uses it only for scaling the
-output latents.
-
-Next, the U-Net iteratively *denoises* the random latent image
-representations while being conditioned on the text embeddings. The
-output of the U-Net, being the noise residual, is used to compute a
-denoised latent image representation via a scheduler algorithm. LCM
-introduces its own scheduling algorithm that extends the denoising procedure
-introduced in denoising diffusion probabilistic models (DDPMs) with
-non-Markovian guidance. The *denoising* process is repeated a given number
-of times (50 by default in the original SD pipeline, but only about 2-8 steps
-for LCM) to retrieve better latent image representations step by step. When
-complete, the latent image representation is decoded by the decoder part of
-the variational autoencoder.
-
-To start working with LCM, we should instantiate the generation pipeline
-first. The ``DiffusionPipeline.from_pretrained`` method downloads all
-pipeline components (if required) for LCM and configures them. Loading LCM
-for OpenVINO inference using Optimum Intel looks similar; we only need to
-replace ``DiffusionPipeline`` with ``OVDiffusionPipeline``. This model class
-accepts a model id from the HuggingFace Hub or a local directory containing
-either the original PyTorch pipeline or an already converted one. If a path
-to the original pipeline is provided, it is automatically converted to
-OpenVINO format, but since we already converted the model with the Optimum
-CLI, we will use the models from the previous step.
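-
-Before moving on, it can be useful to confirm that the export above actually
-produced per-component IR files. The snippet below is a small sanity check,
-not part of the original notebook; only the ``unet`` subfolder name is used
-later in this document, and the other component names are assumptions based
-on the usual Optimum Intel export layout.
-
-.. code:: python
-
-   from pathlib import Path
-
-   model_path = Path("LCM_Dreamshaper_v7_ov")  # directory created by the export step above
-   for component in ["unet", "text_encoder", "vae_decoder"]:
-       xml_file = model_path / component / "openvino_model.xml"
-       print(f"{xml_file}: {'found' if xml_file.exists() else 'missing'}")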
-
-Configure Inference Pipeline
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
-
-Optionally, we can set up which device will be used for running inference.
-Select the desired inference device from the dropdown list below.
-
-.. code:: ipython3
-
-    from notebook_utils import device_widget
-
-    device = device_widget()
-
-    device
-
-
-
-
-.. parsed-literal::
-
-    Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')
-
-
-
-.. code:: ipython3
-
-    from optimum.intel.openvino import OVDiffusionPipeline
-
-    ov_pipe = OVDiffusionPipeline.from_pretrained(model_path, device=device.value)
-
-
-.. parsed-literal::
-
-    2024-11-14 12:52:11.556586: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
-    2024-11-14 12:52:11.570192: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
-    WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
-    E0000 00:00:1731574331.585339 2056327 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
-    E0000 00:00:1731574331.589784 2056327 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
-    2024-11-14 12:52:11.606540: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
-    To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
-
-
-The model tokenizer and scheduler are also important parts of the pipeline.
-The pipeline can also use a Safety Checker, a filter that detects whether a
-generated image contains “not-safe-for-work” (nsfw) content. Nsfw content
-detection requires obtaining image embeddings with a CLIP model, so an
-additional feature extractor component should be added to the pipeline. We
-reuse the tokenizer, feature extractor, scheduler and safety checker from the
-original LCM pipeline.
-
-Text-to-image generation
-------------------------
-
-
-
-Now, let’s see the model in action.
-
-.. code:: ipython3
-
-    import torch
-
-    prompt = "a beautiful pink unicorn, 8k"
-    num_inference_steps = 4
-
-    images = ov_pipe(
-        prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0, height=512, width=512, generator=torch.Generator().manual_seed(1234567)
-    ).images
-
-
-
-.. parsed-literal::
-
-    0%|          | 0/4 [00:00<?, ?it/s]
-
-
-.. image:: latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_13_0.png
-
-
-Quantization
-------------
-
-
-
-`NNCF `__ enables post-training quantization by adding quantization
-layers into the model graph and then using a subset of the training dataset
-to initialize the parameters of these additional quantization layers.
-Quantized operations are executed in ``INT8`` instead of ``FP32``/``FP16``,
-making model inference faster.
-
-According to the ``LatentConsistencyModelPipeline`` structure, UNet is used
-for iterative denoising of the input. This means that the model runs in a
-cycle, repeating inference on each diffusion step, while the other parts of
-the pipeline take part only once. That is why the computation cost and speed
-of UNet denoising become the critical path in the pipeline.
Quantizing the -rest of the SD pipeline does not significantly improve inference -performance but can lead to a substantial degradation of accuracy. - -The optimization process contains the following steps: - -1. Create a calibration dataset for quantization. -2. Run ``nncf.quantize()`` to obtain quantized model. -3. Save the ``INT8`` model using ``openvino.save_model()`` function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - skip_for_device = "GPU" in device.value - to_quantize = quantization_widget(not skip_for_device) - int8_model_path = model_path.parent / (model_path.name + "_int8") - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - %load_ext skip_kernel_extension - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -Prepare calibration dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use a portion of -`conceptual_captions `__ -dataset from Hugging Face as calibration data. To collect intermediate -model inputs for calibration we should customize ``CompiledModel``. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import datasets - from tqdm.notebook import tqdm - from transformers import set_seed - from typing import Any, Dict, List - import openvino as ov - import numpy as np - - set_seed(1) - - class CompiledModelDecorator(ov.CompiledModel): - def __init__(self, compiled_model, prob: float, data_cache: List[Any] = None): - super().__init__(compiled_model) - self.data_cache = data_cache if data_cache else [] - self.prob = np.clip(prob, 0, 1) - - def __call__(self, *args, **kwargs): - if np.random.rand() >= self.prob: - self.data_cache.append(*args) - return super().__call__(*args, **kwargs) - - def collect_calibration_data(lcm_pipeline, subset_size: int) -> List[Dict]: - original_unet = lcm_pipeline.unet.request - lcm_pipeline.unet.request = CompiledModelDecorator(original_unet, prob=0.3) - - dataset = datasets.load_dataset("google-research-datasets/conceptual_captions", split="train", trust_remote_code=True).shuffle(seed=42) - lcm_pipeline.set_progress_bar_config(disable=True) - safety_checker = lcm_pipeline.safety_checker - lcm_pipeline.safety_checker = None - - # Run inference for data collection - pbar = tqdm(total=subset_size) - diff = 0 - for batch in dataset: - prompt = batch["caption"] - if len(prompt) > lcm_pipeline.tokenizer.model_max_length: - continue - _ = lcm_pipeline( - prompt, - num_inference_steps=num_inference_steps, - guidance_scale=8.0, - height=512, - width=512, - ) - collected_subset_size = len(lcm_pipeline.unet.request.data_cache) - if collected_subset_size >= subset_size: - pbar.update(subset_size - pbar.n) - break - pbar.update(collected_subset_size - diff) - diff = collected_subset_size - - calibration_dataset = lcm_pipeline.unet.request.data_cache - lcm_pipeline.set_progress_bar_config(disable=False) - lcm_pipeline.unet.request = original_unet - lcm_pipeline.safety_checker = safety_checker - return calibration_dataset - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - import logging - logging.basicConfig(level=logging.WARNING) - logger = logging.getLogger(__name__) - - if not int8_model_path.exists(): - subset_size = 200 - ov_pipe = OVDiffusionPipeline.from_pretrained(model_path, device=device.value) - unet_calibration_data = collect_calibration_data(ov_pipe, subset_size=subset_size) - del ov_pipe - gc.collect(); - - - -.. parsed-literal:: - - 0%| | 0/200 [00:00= validation_size: - break - prompt = batch["caption"] - validation_data.append(prompt) - - def calculate_inference_time(pipeline, calibration_dataset): - inference_time = [] - pipeline.set_progress_bar_config(disable=True) - for idx, prompt in enumerate(validation_data): - start = time.perf_counter() - _ = pipeline( - prompt, - num_inference_steps=num_inference_steps, - guidance_scale=8.0, - height=512, - width=512, - ) - end = time.perf_counter() - delta = end - start - inference_time.append(delta) - if idx >= validation_size: - break - return np.median(inference_time) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - int8_latency = calculate_inference_time(int8_pipe, validation_data) - del int8_pipe - gc.collect() - ov_pipe = OVDiffusionPipeline.from_pretrained(model_path, device=device.value) - fp_latency = calculate_inference_time(ov_pipe, validation_data) - print(f"Performance speed up: {fp_latency / int8_latency:.3f}") - - del ov_pipe - gc.collect(); - - -.. parsed-literal:: - - Performance speed up: 1.357 - - -Compare UNet file size -~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - UNET_OV_PATH = model_path / "unet/openvino_model.xml" - UNET_INT8_OV_PATH = int8_model_path / "unet/openvino_model.xml" - - if UNET_INT8_OV_PATH.exists(): - fp16_ir_model_size = UNET_OV_PATH.with_suffix(".bin").stat().st_size / 1024 - quantized_model_size = UNET_INT8_OV_PATH.with_suffix(".bin").stat().st_size / 1024 - - print(f"FP16 model size: {fp16_ir_model_size:.2f} KB") - print(f"INT8 model size: {quantized_model_size:.2f} KB") - print(f"Model compression rate: {fp16_ir_model_size / quantized_model_size:.3f}") - - -.. parsed-literal:: - - FP16 model size: 1678912.69 KB - INT8 model size: 841591.46 KB - Model compression rate: 1.995 - - -Run Text to image generation using OpenVINO GenAI -------------------------------------------------- - - - -`OpenVINO™ GenAI `__ -is a library of the most popular Generative AI model pipelines, -optimized execution methods, and samples that run on top of highly -performant `OpenVINO -Runtime `__. - -|image0| - -This library is friendly to PC and laptop execution, and optimized for -resource consumption. It requires no external dependencies to run -generative models as it already includes all the core functionality. - -``openvino_genai.Text2ImagePipeline`` class supports inference of -`Diffusers -models `__. -For pipeline initialization, we should provide directory with converted -by Optimum Intel pipeline and specify inference device. Optionally, we -can provide configuration for LoRA Adapters using ``adapter_config``. -For starting generation process ``generate`` method should be used. -Basically, it required to provide input text prompt for image -generation. You can provide additional arguments like negative prompt, -number of steps, guidance scale, image width and height to control -generation process. - -.. |image0| image:: https://media.githubusercontent.com/media/openvinotoolkit/openvino.genai/refs/heads/master/src/docs/openvino_genai.svg - -.. code:: ipython3 - - device - - - - -.. 
parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - import ipywidgets as widgets - - int8_can_be_used = int8_model_path.exists() and "GPU" not in device.value - use_quantized_model = widgets.Checkbox(value=int8_can_be_used, description="Use INT8 model", disabled=not int8_can_be_used) - - use_quantized_model - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Use INT8 model') - - - -.. code:: ipython3 - - import openvino_genai as ov_genai - - used_model_path = model_path if not use_quantized_model.value else int8_model_path - - pipe = ov_genai.Text2ImagePipeline(used_model_path, device.value) - -.. code:: ipython3 - - from PIL import Image - import torch - import openvino as ov - - - class Generator(ov_genai.Generator): - def __init__(self, seed): - ov_genai.Generator.__init__(self) - self.generator = torch.Generator(device="cpu").manual_seed(seed) - - def next(self): - return torch.randn(1, generator=self.generator, dtype=torch.float32).item() - - def randn_tensor(self, shape: ov.Shape): - torch_tensor = torch.randn(list(shape), generator=self.generator, dtype=torch.float32) - return ov.Tensor(torch_tensor.numpy()) - - - prompt = "a beautiful pink unicorn, 8k" - num_inference_steps = 4 - - random_generator = Generator(1234567) - - image_tensor = pipe.generate(prompt, width=512, height=512, num_inference_steps=4, num_images_per_prompt=1, generator=random_generator) - - image = Image.fromarray(image_tensor.data[0]) - - image - - - - -.. image:: latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_37_0.png - - - -Interactive demo ----------------- - - - -.. code:: ipython3 - - import random - import gradio as gr - import numpy as np - - MAX_SEED = np.iinfo(np.int32).max - - - def randomize_seed_fn(seed: int, randomize_seed: bool) -> int: - if randomize_seed: - seed = random.randint(0, MAX_SEED) - return seed - - - def generate( - prompt: str, - seed: int = 0, - width: int = 512, - height: int = 512, - guidance_scale: float = 8.0, - num_inference_steps: int = 4, - randomize_seed: bool = False, - progress=gr.Progress(track_tqdm=True), - ): - seed = randomize_seed_fn(seed, randomize_seed) - random_generator = Generator(seed) - result = pipe.generate( - prompt, width=width, height=height, guidance_scale=guidance_scale, num_inference_steps=num_inference_steps, generator=random_generator - ) - result = Image.fromarray(result.data[0]) - return result, seed - -.. 
code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/latent-consistency-models-image-generation/gradio_helper.py" - ) - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo_lcm - - demo = make_demo_lcm(fn=generate) - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_13_0.jpg b/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_13_0.jpg deleted file mode 100644 index 1ea60cbbf8d222..00000000000000 --- a/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_13_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:affe930458b7c4c643d79b905269590fc084ca969ee5f0545b8bba525006fa8a -size 19295 diff --git a/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_13_0.png b/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_13_0.png deleted file mode 100644 index 5955c1e4362d9f..00000000000000 --- a/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_13_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7ef1cbdb29f5fea43c3624c52f20799e4677fc0f52f6451bbe24bf0cf11a8463 -size 389641 diff --git a/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_27_1.jpg b/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_27_1.jpg deleted file mode 100644 index 6408a5658cf117..00000000000000 --- a/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_27_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e8925bd54982f37545c019dbe0594bd794045ee40e5627f0121b221b44471c62 -size 19352 diff --git a/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_27_1.png b/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_27_1.png deleted file mode 100644 index 7b0ec07f79f970..00000000000000 --- a/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_27_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3c7034ea0158e17cbd009e742938fe42fd1e0fb0011d0d2512524d6fab00889e -size 392614 diff --git a/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_37_0.jpg 
b/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_37_0.jpg deleted file mode 100644 index 4710b7e9307c1b..00000000000000 --- a/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_37_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8b3bf64cb2d0dc5daa9387092f9c09eea26af451b5a6e0e7c5750d22a5fb66b1 -size 21932 diff --git a/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_37_0.png b/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_37_0.png deleted file mode 100644 index 7667008b2d5aa5..00000000000000 --- a/docs/notebooks/latent-consistency-models-image-generation-with-output_files/latent-consistency-models-image-generation-with-output_37_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:307294292b8bf501d51fae0bc667d06907d8d5b2adf9ed139467b766eccac901 -size 401843 diff --git a/docs/notebooks/lcm-lora-controlnet-with-output.rst b/docs/notebooks/lcm-lora-controlnet-with-output.rst deleted file mode 100644 index dfd0c54e4b06bd..00000000000000 --- a/docs/notebooks/lcm-lora-controlnet-with-output.rst +++ /dev/null @@ -1,1763 +0,0 @@ -Text-to-Image Generation with LCM LoRA and ControlNet Conditioning -================================================================== - -Diffusion models make a revolution in AI-generated art. This technology -enables the creation of high-quality images simply by writing a text -prompt. Even though this technology gives very promising results, the -diffusion process, in the first order, is the process of generating -images from random noise and text conditions, which do not always -clarify how desired content should look, which forms it should have, and -where it is located in relation to other objects on the image. -Researchers have been looking for ways to have more control over the -results of the generation process. ControlNet provides a minimal -interface allowing users to customize the generation process to a great -extent. - -ControlNet was introduced in `Adding Conditional Control to -Text-to-Image Diffusion Models `__ -paper. It provides a framework that enables support for various spatial -contexts such as a depth map, a segmentation map, a scribble, and key -points that can serve as additional conditionings to Diffusion models -such as Stable Diffusion. - -Latent Consistency Models (LCM) are a way to decrease the number of -steps required to generate an image with Stable Diffusion (or SDXL) by -distilling the original model into another version that requires fewer -steps (4 to 8 instead of the original 25 to 50). Distillation is a type -of training procedure that attempts to replicate the outputs from a -source model using a new one. The distilled model may be designed to be -smaller or, in this case, require fewer steps to run. It’s usually a -lengthy and costly process that requires huge amounts of data, patience, -and powerful training hardware. - -For latent consistency distillation, each model needs to be distilled -separately. The LCM LoRA allows to train just a small number of -adapters, known as LoRA layers, instead of the full model. 
The resulting -LoRAs can then be applied to any fine-tuned version of the model without -having to distil them separately. The benefit of this LCM LoRA -distillation process is that it can be integrated into the existing -inference pipelines without changes to the main code, for example, into -the ControlNet-guided Stable Diffusion pipeline. More details about LCM -LoRA can be found in the `technical -report `__ and `blog -post `__ - -This notebook explores how to speed up ControlNet pipeline using LCM -LoRA, OpenVINO and quantization with -`NNCF `__. Let us get -“controlling”! - - -**Table of contents:** - - -- `Background <#background>`__ - - - `Stable Diffusion <#stable-diffusion>`__ - - `ControlNet <#controlnet>`__ - - `Low-Rank Adaptation of Large Language Models - (LoRA) <#low-rank-adaptation-of-large-language-models-lora>`__ - -- `Prerequisites <#prerequisites>`__ -- `Load Original Diffusers pipeline and prepare models for - conversion <#load-original-diffusers-pipeline-and-prepare-models-for-conversion>`__ -- `Condition Image <#condition-image>`__ -- `Convert models to OpenVINO Intermediate representation (IR) - format <#convert-models-to-openvino-intermediate-representation-ir-format>`__ - - - `ControlNet conversion <#controlnet-conversion>`__ - - `U-Net <#u-net>`__ - - `Text Encoder <#text-encoder>`__ - - `VAE Decoder conversion <#vae-decoder-conversion>`__ - -- `Prepare Inference pipeline <#prepare-inference-pipeline>`__ - - - `Prepare tokenizer and - LCMScheduler <#prepare-tokenizer-and-lcmscheduler>`__ - - `Select inference device for Stable Diffusion - pipeline <#select-inference-device-for-stable-diffusion-pipeline>`__ - -- `Running Text-to-Image Generation with ControlNet Conditioning and - OpenVINO <#running-text-to-image-generation-with-controlnet-conditioning-and-openvino>`__ -- `Quantization <#quantization>`__ - - - `Prepare calibration datasets <#prepare-calibration-datasets>`__ - - `Run quantization <#run-quantization>`__ - - `Compare inference time of the FP16 and INT8 - models <#compare-inference-time-of-the-fp16-and-int8-models>`__ - - - `Compare model file sizes <#compare-model-file-sizes>`__ - -- `Interactive Demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Background ----------- - - - -Stable Diffusion -~~~~~~~~~~~~~~~~ - - - -`Stable Diffusion `__ is a -text-to-image latent diffusion model created by researchers and -engineers from CompVis, Stability AI, and LAION. Diffusion models as -mentioned above can generate high-quality images. Stable Diffusion is -based on a particular type of diffusion model called Latent Diffusion, -proposed in `High-Resolution Image Synthesis with Latent Diffusion -Models `__ paper. Generally speaking, -diffusion models are machine learning systems that are trained to -denoise random Gaussian noise step by step, to get to a sample of -interest, such as an image. Diffusion models have been shown to achieve -state-of-the-art results for generating image data. But one downside of -diffusion models is that the reverse denoising process is slow because -of its repeated, sequential nature. In addition, these models consume a -lot of memory because they operate in pixel space, which becomes huge -when generating high-resolution images. 
Latent diffusion can reduce the -memory and compute complexity by applying the diffusion process over a -lower dimensional latent space, instead of using the actual pixel space. -This is the key difference between standard diffusion and latent -diffusion models: in latent diffusion, the model is trained to generate -latent (compressed) representations of the images. - -There are three main components in latent diffusion: - -- A text-encoder, for example `CLIP’s Text - Encoder `__ - for creation condition to generate image from text prompt. -- A U-Net for step-by-step denoising latent image representation. -- An autoencoder (VAE) for encoding input image to latent space (if - required) and decoding latent space to image back after generation. - -For more details regarding Stable Diffusion work, refer to the `project -website `__. -There is a tutorial for Stable Diffusion Text-to-Image generation with -OpenVINO, see the following -`notebook `__. - -ControlNet -~~~~~~~~~~ - -ControlNet is a neural network -structure to control diffusion models by adding extra conditions. Using -this new framework, we can capture a scene, structure, object, or -subject pose from an inputted image, and then transfer that quality to -the generation process. In practice, this enables the model to -completely retain the original input shape, and create a novel image -that conserves the shape, pose, or outline while using the novel -features from the inputted prompt. - -.. figure:: https://raw.githubusercontent.com/lllyasviel/ControlNet/main/github_page/he.png - :alt: controlnet block - - controlnet block - -Functionally, ControlNet operates by wrapping around an image synthesis -process to impart attention to the shape required to operate the model -using either its inbuilt prediction or one of many additional annotator -models. Referring to the diagram above, we can see, on a rudimentary -level, how ControlNet uses a trainable copy in conjunction with the -original network to modify the final output with respect to the shape of -the input control source. - -By repeating the above simple structure 14 times, we can control stable -diffusion in the following way: - -.. figure:: https://raw.githubusercontent.com/lllyasviel/ControlNet/main/github_page/sd.png - :alt: sd + controlnet - - sd + controlnet - -The input is simultaneously passed through the SD blocks, represented on -the left, while simultaneously being processed by the ControlNet blocks -on the right. This process is almost the same during encoding. When -denoising the image, at each step the SD decoder blocks will receive -control adjustments from the parallel processing path from ControlNet. - -In the end, we are left with a very similar image synthesis pipeline -with an additional control added for the shape of the output features in -the final image. - -Low-Rank Adaptation of Large Language Models (LoRA) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -`Low-Rank Adaptation of Large Language Models -(LoRA) `__ is a training method that -accelerates the training of large models while consuming less memory. It -adds pairs of rank-decomposition weight matrices (called update -matrices) to existing weights, and only trains those newly added -weights. This has a couple of advantages: - -- LoRA makes fine-tuning more efficient by drastically reducing the - number of trainable parameters. 
-- The original pre-trained weights are kept frozen, which means you can
-  have multiple lightweight and portable LoRA models for various
-  downstream tasks built on top of them.
-- LoRA is orthogonal to many other parameter-efficient methods and can
-  be combined with many of them.
-- The performance of models fine-tuned using LoRA is comparable to the
-  performance of fully fine-tuned models.
-- LoRA does not add any inference latency because adapter weights can
-  be merged with the base model.
-
-In principle, LoRA can be applied to any subset of weight matrices in a
-neural network to reduce the number of trainable parameters. However,
-for simplicity and further parameter efficiency, in Transformer models
-LoRA is typically applied to attention blocks only. The resulting number
-of trainable parameters in a LoRA model depends on the size of the
-low-rank update matrices, which is determined mainly by the rank r and
-the shape of the original weight matrix. More details about LoRA can be
-found in the HuggingFace `conceptual
-guide `__,
-`Diffusers
-documentation `__
-and `blog post `__.
-
-Prerequisites
--------------
-
-
-
-Install the required packages:
-
-.. code:: ipython3
-
-    %pip install -q "torch" einops torchvision transformers "diffusers>=0.24.0" timm "peft>=0.6.2" accelerate --extra-index-url https://download.pytorch.org/whl/cpu
-    %pip install -q "openvino>=2023.2.0" pillow "gradio>=4.19" "datasets>=2.14.6" "nncf>=2.7.0" "matplotlib>=3.4"
-    %pip install -q --no-deps "controlnet-aux>=0.0.6"
-    %pip install -q scipy opencv-python filelock scikit-image
-
-Prepare the PyTorch models:
-
-.. code:: ipython3
-
-    from pathlib import Path
-
-    controlnet_id = "lllyasviel/control_v11p_sd15_normalbae"
-    adapter_id = "latent-consistency/lcm-lora-sdv1-5"
-    stable_diffusion_id = "botp/stable-diffusion-v1-5"
-
-    TEXT_ENCODER_OV_PATH = Path("model/text_encoder.xml")
-    UNET_OV_PATH = Path("model/unet_controlnet.xml")
-    CONTROLNET_OV_PATH = Path("model/controlnet-normalbae.xml")
-    VAE_DECODER_OV_PATH = Path("model/vae_decoder.xml")
-    TOKENIZER_PATH = Path("model/tokenizer")
-    SCHEDULER_PATH = Path("model/scheduler")
-
-    skip_models = TEXT_ENCODER_OV_PATH.exists() and UNET_OV_PATH.exists() and CONTROLNET_OV_PATH.exists() and VAE_DECODER_OV_PATH.exists()
-
-Load Original Diffusers pipeline and prepare models for conversion
--------------------------------------------------------------------
-
-
-
-For working with Stable Diffusion and ControlNet models, we will use the
-Hugging Face `Diffusers `__ library. To experiment with ControlNet,
-Diffusers exposes the `StableDiffusionControlNetPipeline `__, similar
-to the `other Diffusers pipelines `__. Central to the
-``StableDiffusionControlNetPipeline`` is the ``controlnet`` argument, which
-lets you provide a particular trained `ControlNetModel `__ instance
-while keeping the pre-trained diffusion model weights the same.
-
-The code below demonstrates how to create a
-``StableDiffusionControlNetPipeline``. The process consists of the
-following steps:
-
-1. Create a ``ControlNetModel`` to pass to the pipeline using the
-   ``from_pretrained`` method.
-2. Create a ``StableDiffusionControlNetPipeline`` using the Stable Diffusion
-   and ControlNet models.
-3. Load LoRA weights into the pipeline using the ``load_lora_weights``
-   method.
-
-..
code:: ipython3 - - from diffusers import StableDiffusionControlNetPipeline, ControlNetModel - import gc - - - def load_original_pytorch_pipeline_components(controlnet_id: str, stable_diffusion_id: str, adapter_id: str): - """ - Helper function for loading Stable Diffusion ControlNet pipeline and applying LCM LoRA - - Parameters: - controlnet_id: model id from HuggingFace hub or local path for loading ControlNet model - stable_diffusion_id: model id from HuggingFace hub or local path for loading Stable Diffusion model - adapter_id: LCM LoRA id from HuggingFace hub or local path - Returns: - controlnet: ControlNet model - text_encoder: Stable Diffusion Text Encoder - unet: Stable Diffusion U-Net - vae: Stable Diffusion Variational Autoencoder (VAE) - """ - - # load controlnet model - controlnet = ControlNetModel.from_pretrained(controlnet_id) - # load stable diffusion pipeline - pipe = StableDiffusionControlNetPipeline.from_pretrained(stable_diffusion_id, controlnet=controlnet) - # load LCM LoRA weights - pipe.load_lora_weights(adapter_id) - # fuse LoRA weights with UNet - pipe.fuse_lora() - text_encoder = pipe.text_encoder - text_encoder.eval() - unet = pipe.unet - unet.eval() - vae = pipe.vae - vae.eval() - del pipe - gc.collect() - return controlnet, text_encoder, unet, vae - -.. code:: ipython3 - - controlnet, text_encoder, unet, vae = None, None, None, None - if not skip_models: - controlnet, text_encoder, unet, vae = load_original_pytorch_pipeline_components(controlnet_id, stable_diffusion_id, adapter_id) - -Condition Image ---------------- - - - -The process of extracting specific information from the input image is -called an annotation. ControlNet comes pre-packaged with compatibility -with several annotators-models that help it to identify the shape/form -of the target in the image: - -- Canny Edge Detection -- M-LSD Lines -- HED Boundary -- Scribbles -- Normal Map -- Human Pose Estimation -- Semantic Segmentation -- Depth Estimation - -In this tutorial we will use `Normal -Mapping `__ for -controlling diffusion process. For this case, ControlNet condition image -is an image with surface normal information, usually represented as a -color-coded image. - -.. 
code:: ipython3 - - from controlnet_aux import NormalBaeDetector - from diffusers.utils import load_image - import requests - import matplotlib.pyplot as plt - from PIL import Image - import numpy as np - - if not Path("example.png").exists(): - example_image_url = "https://huggingface.co/lllyasviel/control_v11p_sd15_normalbae/resolve/main/images/input.png" - r = requests.get(example_image_url) - with open("example.png", "wb") as f: - f.write(r.content) - - processor = NormalBaeDetector.from_pretrained("lllyasviel/Annotators") - - image = load_image("example.png") - control_image = processor(image) - - - def visualize_results( - orig_img: Image.Image, - normal_img: Image.Image, - result_img: Image.Image = None, - save_fig: bool = False, - ): - """ - Helper function for results visualization - - Parameters: - orig_img (Image.Image): original image - normal_img (Image.Image): image with bwith surface normal information - result_img (Image.Image, optional, default None): generated image - safe_fig (bool, optional, default False): allow saving visualization result on disk - Returns: - fig (matplotlib.pyplot.Figure): matplotlib generated figure contains drawing result - """ - orig_title = "Original image" - control_title = "Normal map" - orig_img = orig_img.resize(normal_img.size if result_img is None else result_img.size) - im_w, im_h = orig_img.size - is_horizontal = im_h <= im_w - figsize = (20, 20) - num_images = 3 if result_img is not None else 2 - fig, axs = plt.subplots( - num_images if is_horizontal else 1, - 1 if is_horizontal else num_images, - figsize=figsize, - sharex="all", - sharey="all", - ) - fig.patch.set_facecolor("white") - list_axes = list(axs.flat) - for a in list_axes: - a.set_xticklabels([]) - a.set_yticklabels([]) - a.get_xaxis().set_visible(False) - a.get_yaxis().set_visible(False) - a.grid(False) - list_axes[0].imshow(np.array(orig_img)) - list_axes[1].imshow(np.array(normal_img)) - list_axes[0].set_title(orig_title, fontsize=15) - list_axes[1].set_title(control_title, fontsize=15) - if result_img is not None: - list_axes[2].imshow(np.array(result_img)) - list_axes[2].set_title("Result", fontsize=15) - - fig.subplots_adjust(wspace=0.01 if is_horizontal else 0.00, hspace=0.01 if is_horizontal else 0.1) - fig.tight_layout() - if save_fig: - fig.savefig("result.png", bbox_inches="tight") - return fig - - - fig = visualize_results(image, control_image) - - -.. parsed-literal:: - - Loading base model ()...Done. - Removing last two layers (global_pool & classifier). - - - -.. image:: lcm-lora-controlnet-with-output_files/lcm-lora-controlnet-with-output_10_1.png - - -Convert models to OpenVINO Intermediate representation (IR) format ------------------------------------------------------------------- - - - -Starting from 2023.0 release, OpenVINO supports PyTorch models -conversion directly. We need to provide a model object, input data for -model tracing to ``ov.convert_model`` function to obtain OpenVINO -``ov.Model`` object instance. Model can be saved on disk for next -deployment using ``ov.save_model`` function. - -The pipeline consists of five important parts: - -- ControlNet for conditioning by image annotation. -- Text Encoder for creation condition to generate an image from a text - prompt. -- Unet for step-by-step denoising latent image representation. -- Autoencoder (VAE) for decoding latent space to image. 
- -Let us convert each part: - -ControlNet conversion -~~~~~~~~~~~~~~~~~~~~~ - - - -The ControlNet model accepts the same inputs like UNet in Stable -Diffusion pipeline and additional condition sample - skeleton key points -map predicted by pose estimator: - -- ``sample`` - latent image sample from the previous step, generation - process has not been started yet, so we will use random noise, -- ``timestep`` - current scheduler step, -- ``encoder_hidden_state`` - hidden state of text encoder, -- ``controlnet_cond`` - condition input annotation. - -The output of the model is attention hidden states from down and middle -blocks, which serves additional context for the UNet model. - -.. code:: ipython3 - - import torch - import openvino as ov - from functools import partial - - - def cleanup_torchscript_cache(): - """ - Helper for removing cached model representation - """ - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - - - def flattenize_inputs(inputs): - """ - Helper function for resolve nested input structure (e.g. lists or tuples of tensors) - """ - flatten_inputs = [] - for input_data in inputs: - if input_data is None: - continue - if isinstance(input_data, (list, tuple)): - flatten_inputs.extend(flattenize_inputs(input_data)) - else: - flatten_inputs.append(input_data) - return flatten_inputs - - - dtype_mapping = { - torch.float32: ov.Type.f32, - torch.float64: ov.Type.f64, - torch.int32: ov.Type.i32, - torch.int64: ov.Type.i64, - } - - - def prepare_input_info(input_dict): - """ - Helper function for preparing input info (shapes and data types) for conversion based on example inputs - """ - flatten_inputs = flattenize_inputs(inputs.values()) - input_info = [] - for input_data in flatten_inputs: - updated_shape = list(input_data.shape) - if updated_shape: - updated_shape[0] = -1 - if input_data.ndim == 4: - updated_shape[2] = -1 - updated_shape[3] = -1 - - input_info.append((dtype_mapping[input_data.dtype], updated_shape)) - return input_info - - - inputs = { - "sample": torch.randn((1, 4, 64, 64)), - "timestep": torch.tensor(1, dtype=torch.float32), - "encoder_hidden_states": torch.randn((1, 77, 768)), - "controlnet_cond": torch.randn((1, 3, 512, 512)), - } - - - # Prepare conditional inputs for U-Net - if not UNET_OV_PATH.exists(): - controlnet.eval() - with torch.no_grad(): - down_block_res_samples, mid_block_res_sample = controlnet(**inputs, return_dict=False) - - if not CONTROLNET_OV_PATH.exists(): - input_info = prepare_input_info(inputs) - with torch.no_grad(): - controlnet.forward = partial(controlnet.forward, return_dict=False) - ov_model = ov.convert_model(controlnet, example_input=inputs, input=input_info) - ov.save_model(ov_model, CONTROLNET_OV_PATH) - del ov_model - cleanup_torchscript_cache() - print("ControlNet successfully converted to IR") - else: - print(f"ControlNet will be loaded from {CONTROLNET_OV_PATH}") - - del controlnet - gc.collect() - - -.. parsed-literal:: - - ControlNet will be loaded from model/controlnet-normalbae.xml - - - - -.. parsed-literal:: - - 9 - - - -U-Net -~~~~~ - - - -The process of U-Net model conversion remains the same, like for -original Stable Diffusion model, but with respect to the new inputs -generated by ControlNet. - -.. 
code:: ipython3 - - from typing import Tuple - - - class UnetWrapper(torch.nn.Module): - def __init__( - self, - unet, - sample_dtype=torch.float32, - timestep_dtype=torch.int64, - encoder_hidden_states=torch.float32, - down_block_additional_residuals=torch.float32, - mid_block_additional_residual=torch.float32, - ): - super().__init__() - self.unet = unet - self.sample_dtype = sample_dtype - self.timestep_dtype = timestep_dtype - self.encoder_hidden_states_dtype = encoder_hidden_states - self.down_block_additional_residuals_dtype = down_block_additional_residuals - self.mid_block_additional_residual_dtype = mid_block_additional_residual - - def forward( - self, - sample: torch.Tensor, - timestep: torch.Tensor, - encoder_hidden_states: torch.Tensor, - down_block_additional_residuals: Tuple[torch.Tensor], - mid_block_additional_residual: torch.Tensor, - ): - sample.to(self.sample_dtype) - timestep.to(self.timestep_dtype) - encoder_hidden_states.to(self.encoder_hidden_states_dtype) - down_block_additional_residuals = [res.to(self.down_block_additional_residuals_dtype) for res in down_block_additional_residuals] - mid_block_additional_residual.to(self.mid_block_additional_residual_dtype) - return self.unet( - sample, - timestep, - encoder_hidden_states, - down_block_additional_residuals=down_block_additional_residuals, - mid_block_additional_residual=mid_block_additional_residual, - ) - - - if not UNET_OV_PATH.exists(): - inputs.pop("controlnet_cond", None) - inputs["down_block_additional_residuals"] = down_block_res_samples - inputs["mid_block_additional_residual"] = mid_block_res_sample - input_info = prepare_input_info(inputs) - - wrapped_unet = UnetWrapper(unet) - wrapped_unet.eval() - - with torch.no_grad(): - ov_model = ov.convert_model(wrapped_unet, example_input=inputs) - - for (input_dtype, input_shape), input_tensor in zip(input_info, ov_model.inputs): - input_tensor.get_node().set_partial_shape(ov.PartialShape(input_shape)) - input_tensor.get_node().set_element_type(input_dtype) - ov_model.validate_nodes_and_infer_types() - ov.save_model(ov_model, UNET_OV_PATH) - del ov_model - cleanup_torchscript_cache() - del wrapped_unet - del unet - gc.collect() - print("Unet successfully converted to IR") - else: - del unet - print(f"Unet will be loaded from {UNET_OV_PATH}") - gc.collect() - - -.. parsed-literal:: - - Unet will be loaded from model/unet_controlnet.xml - - - - -.. parsed-literal:: - - 0 - - - -Text Encoder -~~~~~~~~~~~~ - - - -The text-encoder is responsible for transforming the input prompt, for -example, “a photo of an astronaut riding a horse” into an embedding -space that can be understood by the U-Net. It is usually a simple -transformer-based encoder that maps a sequence of input tokens to a -sequence of latent text embeddings. - -The input of the text encoder is tensor ``input_ids``, which contains -indexes of tokens from text processed by the tokenizer and padded to the -maximum length accepted by the model. Model outputs are two tensors: -``last_hidden_state`` - hidden state from the last MultiHeadAttention -layer in the model and ``pooler_out`` - pooled output for whole model -hidden states. - -.. code:: ipython3 - - def convert_encoder(text_encoder: torch.nn.Module, ir_path: Path): - """ - Convert Text Encoder model to OpenVINO IR. 
- Function accepts text encoder model, prepares example inputs for conversion, and convert it to OpenVINO Model - Parameters: - text_encoder (torch.nn.Module): text_encoder model - ir_path (Path): File for storing model - Returns: - None - """ - if not ir_path.exists(): - input_ids = torch.ones((1, 77), dtype=torch.long) - # switch model to inference mode - text_encoder.eval() - - # disable gradients calculation for reducing memory consumption - with torch.no_grad(): - ov_model = ov.convert_model( - text_encoder, # model instance - example_input=input_ids, # inputs for model tracing - input=([1, 77],), - ) - ov.save_model(ov_model, ir_path) - del ov_model - cleanup_torchscript_cache() - print("Text Encoder successfully converted to IR") - - - if not TEXT_ENCODER_OV_PATH.exists(): - convert_encoder(text_encoder, TEXT_ENCODER_OV_PATH) - else: - print(f"Text encoder will be loaded from {TEXT_ENCODER_OV_PATH}") - del text_encoder - gc.collect() - - -.. parsed-literal:: - - Text encoder will be loaded from model/text_encoder.xml - - - - -.. parsed-literal:: - - 0 - - - -VAE Decoder conversion -~~~~~~~~~~~~~~~~~~~~~~ - - - -The VAE model has two parts, an encoder, and a decoder. The encoder is -used to convert the image into a low-dimensional latent representation, -which will serve as the input to the U-Net model. The decoder, -conversely, transforms the latent representation back into an image. - -During latent diffusion training, the encoder is used to get the latent -representations (latents) of the images for the forward diffusion -process, which applies more and more noise at each step. During -inference, the denoised latents generated by the reverse diffusion -process are converted back into images using the VAE decoder. During -inference, we will see that we **only need the VAE decoder**. You can -find instructions on how to convert the encoder part in a stable -diffusion -`notebook `__. - -.. code:: ipython3 - - def convert_vae_decoder(vae: torch.nn.Module, ir_path: Path): - """ - Convert VAE model to IR format. - Function accepts pipeline, creates wrapper class for export only necessary for inference part, - prepares example inputs for convert, - Parameters: - vae (torch.nn.Module): VAE model - ir_path (Path): File for storing model - Returns: - None - """ - - class VAEDecoderWrapper(torch.nn.Module): - def __init__(self, vae): - super().__init__() - self.vae = vae - - def forward(self, latents): - return self.vae.decode(latents) - - if not ir_path.exists(): - vae_decoder = VAEDecoderWrapper(vae) - latents = torch.zeros((1, 4, 64, 64)) - - vae_decoder.eval() - with torch.no_grad(): - ov_model = ov.convert_model(vae_decoder, example_input=latents, input=[-1, 4, -1, -1]) - ov.save_model(ov_model, ir_path) - del ov_model - cleanup_torchscript_cache() - print("VAE decoder successfully converted to IR") - - - if not VAE_DECODER_OV_PATH.exists(): - convert_vae_decoder(vae, VAE_DECODER_OV_PATH) - else: - print(f"VAE decoder will be loaded from {VAE_DECODER_OV_PATH}") - - del vae - - -.. parsed-literal:: - - VAE decoder will be loaded from model/vae_decoder.xml - - -Prepare Inference pipeline --------------------------- - - - -We already deeply discussed how the ControlNet-guided pipeline works on -example pose-controlled generation in `controlnet -notebook `__. In our current example, -the pipeline remains without changes. Similarly to Diffusers -``StableDiffusionControlNetPipeline``, we define our own -``OVControlNetStableDiffusionPipeline`` inference pipeline based on -OpenVINO. - -.. 
code:: ipython3 - - from diffusers import DiffusionPipeline - from transformers import CLIPTokenizer - from typing import Union, List, Optional, Tuple - import cv2 - - - def scale_fit_to_window(dst_width: int, dst_height: int, image_width: int, image_height: int): - """ - Preprocessing helper function for calculating image size for resize with peserving original aspect ratio - and fitting image to specific window size - - Parameters: - dst_width (int): destination window width - dst_height (int): destination window height - image_width (int): source image width - image_height (int): source image height - Returns: - result_width (int): calculated width for resize - result_height (int): calculated height for resize - """ - im_scale = min(dst_height / image_height, dst_width / image_width) - return int(im_scale * image_width), int(im_scale * image_height) - - - def preprocess(image: Image.Image, dst_height: int = 512, dst_width: int = 512): - """ - Image preprocessing function. Takes image in PIL.Image format, resizes it to keep aspect ration and fits to model input window 512x512, - then converts it to np.ndarray and adds padding with zeros on right or bottom side of image (depends from aspect ratio), after that - converts data to float32 data type and change range of values from [0, 255] to [-1, 1], finally, converts data layout from planar NHWC to NCHW. - The function returns preprocessed input tensor and padding size, which can be used in postprocessing. - - Parameters: - image (Image.Image): input image - dst_width: destination image width - dst_height: destination image height - Returns: - image (np.ndarray): preprocessed image tensor - pad (Tuple[int]): pading size for each dimension for restoring image size in postprocessing - """ - src_width, src_height = image.size - res_width, res_height = scale_fit_to_window(dst_width, dst_height, src_width, src_height) - image = np.array(image.resize((res_width, res_height), resample=Image.Resampling.LANCZOS))[None, :] - pad_width = dst_width - res_width - pad_height = dst_height - res_height - pad = ((0, 0), (0, pad_height), (0, pad_width), (0, 0)) - image = np.pad(image, pad, mode="constant") - image = image.astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - return image, pad - - - def randn_tensor( - shape: Union[Tuple, List], - dtype: Optional[torch.dtype] = torch.float32, - ): - """ - Helper function for generation random values tensor with given shape and data type - - Parameters: - shape (Union[Tuple, List]): shape for filling random values - dtype (torch.dtype, *optiona*, torch.float32): data type for result - Returns: - latents (np.ndarray): tensor with random values with given data type and shape (usually represents noise in latent space) - """ - latents = torch.randn(shape, dtype=dtype) - return latents.numpy() - - - class OVControlNetStableDiffusionPipeline(DiffusionPipeline): - """ - OpenVINO inference pipeline for Stable Diffusion with ControlNet guidence - """ - - def __init__( - self, - tokenizer: CLIPTokenizer, - scheduler, - core: ov.Core, - controlnet: ov.Model, - text_encoder: ov.Model, - unet: ov.Model, - vae_decoder: ov.Model, - device: str = "AUTO", - ): - super().__init__() - self.tokenizer = tokenizer - self.vae_scale_factor = 8 - self.scheduler = scheduler - self.load_models(core, device, controlnet, text_encoder, unet, vae_decoder) - - def load_models( - self, - core: ov.Core, - device: str, - controlnet: ov.Model, - text_encoder: ov.Model, - unet: ov.Model, - vae_decoder: ov.Model, - ): - """ - Function 
for loading models on device using OpenVINO - - Parameters: - core (Core): OpenVINO runtime Core class instance - device (str): inference device - controlnet (Model): OpenVINO Model object represents ControlNet - text_encoder (Model): OpenVINO Model object represents text encoder - unet (Model): OpenVINO Model object represents UNet - vae_decoder (Model): OpenVINO Model object represents vae decoder - Returns - None - """ - self.text_encoder = core.compile_model(text_encoder, device) - self.register_to_config(controlnet=core.compile_model(controlnet, device)) - self.register_to_config(unet=core.compile_model(unet, device)) - ov_config = {"INFERENCE_PRECISION_HINT": "f32"} if device != "CPU" else {} - self.vae_decoder = core.compile_model(vae_decoder, device, ov_config) - - def __call__( - self, - prompt: Union[str, List[str]], - image: Image.Image, - num_inference_steps: int = 4, - height: int = 512, - width: int = 512, - negative_prompt: Union[str, List[str]] = None, - guidance_scale: float = 0.5, - controlnet_conditioning_scale: float = 1.0, - latents: Optional[np.array] = None, - output_type: Optional[str] = "pil", - ): - """ - Function invoked when calling the pipeline for generation. - - Parameters: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - image (`Image.Image`): - `Image`, or tensor representing an image batch which will be repainted according to `prompt`. - num_inference_steps (`int`, *optional*, defaults to 100): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - height (int, *optional*, defaults to 512): generated image height - width (int, *optional*, defaults to 512): generated image width - negative_prompt (`str` or `List[str]`): - negative prompt or prompts for generation - guidance_scale (`float`, *optional*, defaults to 0.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. This pipeline requires a value of at least `1`. - latents (`np.ndarray`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `Image.Image` or `np.array`. - Returns: - image ([List[Union[np.ndarray, Image.Image]]): generaited images - - """ - - # 1. Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - if guidance_scale < 1 and negative_prompt: - guidance_scale += 1 - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # 2. 
Encode input prompt - text_embeddings = self._encode_prompt( - prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - negative_prompt=negative_prompt, - ) - - # 3. Preprocess image - orig_width, orig_height = image.size - image, pad = preprocess(image, height, width) - if do_classifier_free_guidance: - image = np.concatenate(([image] * 2)) - - # 4. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = 4 - latents = self.prepare_latents( - batch_size, - num_channels_latents, - height, - width, - latents=latents, - ) - - # 6. Denoising loop - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # Expand the latents if we are doing classifier free guidance. - # The latents are expanded 3 times because for pix2pix the guidance\ - # is applied for both the text and the input image. - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - result = self.controlnet( - [latent_model_input, t, text_embeddings, image], - share_inputs=True, - share_outputs=True, - ) - down_and_mid_blok_samples = [sample * controlnet_conditioning_scale for _, sample in result.items()] - - # predict the noise residual - noise_pred = self.unet( - [ - latent_model_input, - t, - text_embeddings, - *down_and_mid_blok_samples, - ], - share_inputs=True, - share_outputs=True, - )[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1] - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(torch.from_numpy(noise_pred), t, torch.from_numpy(latents)).prev_sample.numpy() - progress_bar.update() - - # 7. Post-processing - image = self.decode_latents(latents, pad) - - # 8. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - image = [img.resize((orig_width, orig_height), Image.Resampling.LANCZOS) for img in image] - else: - image = [cv2.resize(img, (orig_width, orig_width)) for img in image] - - return image - - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int = 1, - do_classifier_free_guidance: bool = True, - negative_prompt: Union[str, List[str]] = None, - ): - """ - Encodes the prompt into text encoder hidden states. 
- - Parameters: - prompt (str or list(str)): prompt to be encoded - num_images_per_prompt (int): number of images that should be generated per prompt - do_classifier_free_guidance (bool): whether to use classifier free guidance or not - negative_prompt (str or list(str)): negative prompt to be encoded - Returns: - text_embeddings (np.ndarray): text encoder hidden states - """ - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - # tokenize input prompts - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - - text_embeddings = self.text_encoder(text_input_ids, share_inputs=True, share_outputs=True)[0] - - # duplicate text embeddings for each generation per prompt - if num_images_per_prompt != 1: - bs_embed, seq_len, _ = text_embeddings.shape - text_embeddings = np.tile(text_embeddings, (1, num_images_per_prompt, 1)) - text_embeddings = np.reshape(text_embeddings, (bs_embed * num_images_per_prompt, seq_len, -1)) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_tokens: List[str] - max_length = text_input_ids.shape[-1] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - else: - uncond_tokens = negative_prompt - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - - uncond_embeddings = self.text_encoder(uncond_input.input_ids, share_inputs=True, share_outputs=True)[0] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = uncond_embeddings.shape[1] - uncond_embeddings = np.tile(uncond_embeddings, (1, num_images_per_prompt, 1)) - uncond_embeddings = np.reshape(uncond_embeddings, (batch_size * num_images_per_prompt, seq_len, -1)) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings = np.concatenate([uncond_embeddings, text_embeddings]) - - return text_embeddings - - def prepare_latents( - self, - batch_size: int, - num_channels_latents: int, - height: int, - width: int, - dtype: np.dtype = torch.float32, - latents: np.ndarray = None, - ): - """ - Preparing noise to image generation. 
If initial latents are not provided, they will be generated randomly, - then prepared latents scaled by the standard deviation required by the scheduler - - Parameters: - batch_size (int): input batch size - num_channels_latents (int): number of channels for noise generation - height (int): image height - width (int): image width - dtype (np.dtype, *optional*, np.float32): dtype for latents generation - latents (np.ndarray, *optional*, None): initial latent noise tensor, if not provided will be generated - Returns: - latents (np.ndarray): scaled initial noise for diffusion - """ - shape = ( - batch_size, - num_channels_latents, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - if latents is None: - latents = randn_tensor(shape, dtype=dtype) - else: - latents = latents - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - def decode_latents(self, latents: np.array, pad: Tuple[int]): - """ - Decode predicted image from latent space using VAE Decoder and unpad image result - - Parameters: - latents (np.ndarray): image encoded in diffusion latent space - pad (Tuple[int]): each side padding sizes obtained on preprocessing step - Returns: - image: decoded by VAE decoder image - """ - latents = 1 / 0.18215 * latents - image = self.vae_decoder(latents)[0] - (_, end_h), (_, end_w) = pad[1:3] - h, w = image.shape[2:] - unpad_h = h - end_h - unpad_w = w - end_w - image = image[:, :, :unpad_h, :unpad_w] - image = np.clip(image / 2 + 0.5, 0, 1) - image = np.transpose(image, (0, 2, 3, 1)) - return image - -Prepare tokenizer and LCMScheduler -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Tokenizer and scheduler are also important parts of the diffusion -pipeline. The tokenizer is responsible for preprocessing user-provided -prompts into token ids that then used by Text Encoder. - -The scheduler takes a model’s output (the sample which the diffusion -process is iterating on) and a timestep to return a denoised sample. The -timestep is important because it dictates where in the diffusion process -the step is; data is generated by iterating forward n timesteps and -inference occurs by propagating backward through the timesteps. There -are many -`schedulers `__ -implemented inside the diffusers library, LCM pipeline required changing -the original pipeline scheduler with -`LCMScheduler `__. - -.. code:: ipython3 - - from diffusers import LCMScheduler - from transformers import AutoTokenizer - - if not TOKENIZER_PATH.exists(): - tokenizer = AutoTokenizer.from_pretrained(stable_diffusion_id, subfolder="tokenizer") - tokenizer.save_pretrained(TOKENIZER_PATH) - else: - tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH) - if not SCHEDULER_PATH.exists(): - scheduler = LCMScheduler.from_pretrained(stable_diffusion_id, subfolder="scheduler") - scheduler.save_pretrained(SCHEDULER_PATH) - else: - scheduler = LCMScheduler.from_config(SCHEDULER_PATH) - -Select inference device for Stable Diffusion pipeline -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. 
code:: ipython3 - - import requests - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import device_widget - - device = device_widget() - - device - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("lcm-lora-controlnet.ipynb") - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'GPU.0', 'GPU.1', 'GPU.2', 'AUTO'), value='CPU') - - - -.. code:: ipython3 - - core = ov.Core() - - ov_pipe = OVControlNetStableDiffusionPipeline( - tokenizer, - scheduler, - core, - CONTROLNET_OV_PATH, - TEXT_ENCODER_OV_PATH, - UNET_OV_PATH, - VAE_DECODER_OV_PATH, - device=device.value, - ) - -Running Text-to-Image Generation with ControlNet Conditioning and OpenVINO --------------------------------------------------------------------------- - - - -Now, we are ready to start generation. For improving the generation -process, we also introduce an opportunity to provide a -``negative prompt``. Technically, positive prompt steers the diffusion -toward the images associated with it, while negative prompt steers the -diffusion away from it. More explanation of how it works can be found in -this -`article `__. -We can keep this field empty if we want to generate image without -negative prompting. - -`Classifier-free guidance (CFG) `__ or -guidance scale is a parameter that controls how much the image -generation process follows the text prompt. The higher the value, the -more the image sticks to a given text input. But this does not mean that -the value should always be set to maximum, as more guidance means less -diversity and quality. According to experiments, the optimal value of -guidance for LCM models is in range between 0 and 2. > Please note, that -negative prompt is applicable only when guidance scale > 1. - -Let’s see model in action - -.. code:: ipython3 - - prompt = "A head full of roses" - torch.manual_seed(4257) - - result = ov_pipe(prompt, control_image, 4) - result[0] - - - -.. parsed-literal:: - - 0%| | 0/4 [00:00`__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -According to ``OVControlNetStableDiffusionPipeline`` structure, -ControlNet and UNet are used in the cycle repeating inference on each -diffusion step, while other parts of pipeline take part only once. That -is why computation cost and speed of ControlNet and UNet become the -critical path in the pipeline. Quantizing the rest of the SD pipeline -does not significantly improve inference performance but can lead to a -substantial degradation of accuracy. - -The optimization process contains the following steps: - -1. Create a calibration dataset for quantization. -2. Run ``nncf.quantize()`` to obtain quantized model. -3. Save the ``INT8`` model using ``openvino.save_model()`` function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. 
code:: ipython3 - - from notebook_utils import quantization_widget - - skip_for_device = "GPU" in device.value - to_quantize = quantization_widget(not skip_for_device) - to_quantize - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - int8_pipe = None - - %load_ext skip_kernel_extension - -Prepare calibration datasets -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use a portion of -`fusing/instructpix2pix-1000-samples `__ -dataset from Hugging Face as calibration data for ControlNet and UNet. - -To collect intermediate model inputs for calibration we should customize -``CompiledModel``. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import datasets - from tqdm.notebook import tqdm - from transformers import set_seed - from typing import Any, Dict, List - - set_seed(1) - - class CompiledModelDecorator(ov.CompiledModel): - def __init__(self, compiled_model, prob: float): - super().__init__(compiled_model) - self.data_cache = [] - self.prob = np.clip(prob, 0, 1) - - def __call__(self, *args, **kwargs): - if np.random.rand() >= self.prob: - self.data_cache.append(*args) - return super().__call__(*args, **kwargs) - - def collect_calibration_data(pipeline: OVControlNetStableDiffusionPipeline, subset_size: int) -> List[Dict]: - original_unet = pipeline.unet - pipeline.unet = CompiledModelDecorator(original_unet, prob=0.3) - - dataset = datasets.load_dataset("fusing/instructpix2pix-1000-samples", split="train", streaming=True).shuffle(seed=42) - pipeline.set_progress_bar_config(disable=True) - - # Run inference for data collection - pbar = tqdm(total=subset_size) - diff = 0 - control_images = [] - for batch in dataset: - prompt = batch["edit_prompt"] - if len(prompt) > tokenizer.model_max_length: - continue - image = batch["input_image"] - control_image = processor(image) - - _ = pipeline(prompt, image=control_image, num_inference_steps=4) - collected_subset_size = len(pipeline.unet.data_cache) - control_images.append((min(collected_subset_size, subset_size), control_image)) - if collected_subset_size >= subset_size: - pbar.update(subset_size - pbar.n) - break - pbar.update(collected_subset_size - diff) - diff = collected_subset_size - - control_calibration_dataset = pipeline.unet.data_cache - pipeline.set_progress_bar_config(disable=False) - pipeline.unet = original_unet - return control_calibration_dataset, control_images - -.. code:: ipython3 - - %%skip not $to_quantize.value - - CONTROLNET_INT8_OV_PATH = Path("model/controlnet-normalbae_int8.xml") - UNET_INT8_OV_PATH = Path("model/unet_controlnet_int8.xml") - if not (CONTROLNET_INT8_OV_PATH.exists() and UNET_INT8_OV_PATH.exists()): - subset_size = 200 - unet_calibration_data, control_images = collect_calibration_data(ov_pipe, subset_size=subset_size) - - - -.. parsed-literal:: - - 0%| | 0/200 [00:00`__ (Large Language and Vision -Assistant) is large multimodal model that aims to develop a -general-purpose visual assistant that can follow both language and image -instructions to complete various real-world tasks. The idea is to -combine the power of large language models (LLMs) with vision encoders -like CLIP to create an end-to-end trained neural assistant that -understands and acts upon multimodal instructions. 
- -In the field of artificial intelligence, the goal is to create a -versatile assistant capable of understanding and executing tasks based -on both visual and language inputs. Current approaches often rely on -large vision models that solve tasks independently, with language only -used to describe image content. While effective, these models have fixed -interfaces with limited interactivity and adaptability to user -instructions. On the other hand, large language models (LLMs) have shown -promise as a universal interface for general-purpose assistants. By -explicitly representing various task instructions in language, these -models can be guided to switch and solve different tasks. To extend this -capability to the multimodal domain, the `LLaVA -paper `__ introduces \`visual -instruction-tuning, a novel approach to building a general-purpose -visual assistant. - -In this tutorial we consider how to use LLaVA model to build multimodal -chatbot using `OpenVINO -GenAI `__. For -demonstration purposes we will use -`LLaVA-1.5-7B `__ model for conversion, -similar steps required to run other models from `LLaVA Model -Zoo `__. - -- Install prerequisites -- Convert model to OpenVINO Intermediate Representation format using - Optimum Intel -- Compress model weights to 4 and 8 bits using NNCF -- Prepare OpenVINO GenAI inference pipeline -- Run OpenVINO model - - -**Table of contents:** - - -- `About model <#about-model>`__ -- `Prerequisites <#prerequisites>`__ -- `Convert and Optimize Model <#convert-and-optimize-model>`__ - - - `Convert model to OpenVINO IR format using Optimum - CLI <#convert-model-to-openvino-ir-format-using-optimum-cli>`__ - - `Compress Model weights to 4 and 8 bits using - NNCF <#compress-model-weights-to-4-and-8-bits-using-nncf>`__ - -- `Prepare OpenVINO GenAI inference - pipeline <#prepare-openvino-genai-inference-pipeline>`__ - - - `Select inference device <#select-inference-device>`__ - - `Select model variant <#select-model-variant>`__ - - `Load OpenVINO model <#load-openvino-model>`__ - -- `Run model inference <#run-model-inference>`__ - - - `Prepare input data <#prepare-input-data>`__ - - `Test model inference <#test-model-inference>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -About model ------------ - - - -LLaVA connects pre-trained `CLIP -ViT-L/14 `__ visual encoder and large -language model like Vicuna, LLaMa v2 or MPT, using a simple projection -matrix - -.. figure:: https://llava-vl.github.io/images/llava_arch.png - :alt: vlp_matrix.png - - vlp_matrix.png - -Model training procedure consists of 2 stages: - -- Stage 1: Pre-training for Feature Alignment. Only the projection - matrix is updated, based on a subset of CC3M. -- Stage 2: Fine-tuning End-to-End.. Both the projection matrix and LLM - are updated for two different use scenarios: - - - Visual Chat: LLaVA is fine-tuned on our generated multimodal - instruction-following data for daily user-oriented applications. - - Science QA: LLaVA is fine-tuned on this multimodal reasoning - dataset for the science domain. - -More details about model can be found in original `project -web-page `__, -`paper `__ and -`repo `__. - -Prerequisites -------------- - - - -Install required dependencies - -.. 
code:: ipython3
-
-    from pathlib import Path
-    import requests
-
-    %pip install -q "torch>=2.3.0" "torchvision" "torchaudio" --index-url https://download.pytorch.org/whl/cpu
-    %pip install -q "git+https://github.com/huggingface/optimum-intel.git" --index-url https://download.pytorch.org/whl/cpu
-    %pip install -q "nncf>=2.14.0" "sentencepiece" "tokenizers>=0.12.1" "transformers>=4.45.0" "gradio>=4.36"
-    %pip install -q -U "openvino-tokenizers>=2024.5.0" "openvino>=2024.5.0" "openvino-genai>=2024.5.0"
-
-
-    utility_files = ["notebook_utils.py", "cmd_helper.py"]
-
-    for utility in utility_files:
-        local_path = Path(utility)
-        if not local_path.exists():
-            r = requests.get(
-                url=f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/{local_path.name}",
-            )
-            with local_path.open("w") as f:
-                f.write(r.text)
-
-    # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry
-    from notebook_utils import collect_telemetry
-
-    collect_telemetry("llava-multimodal-chatbot-genai.ipynb")
-
-Convert and Optimize Model
---------------------------
-
-
-
-Our model conversion and optimization consists of the following steps:
-
-1. Download the original PyTorch model.
-2. Convert the model to OpenVINO format.
-3. Compress model weights using NNCF.
-
-Let’s consider each step more deeply.
-
-Convert model to OpenVINO IR format using Optimum CLI
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
-
-OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate
-Representation format. For convenience, we will use OpenVINO integration
-with HuggingFace Optimum. `Optimum
-Intel `__ is the
-interface between the Transformers and Diffusers libraries and the
-different tools and libraries provided by Intel to accelerate end-to-end
-pipelines on Intel architectures.
-
-Among other use cases, Optimum Intel provides a simple interface to
-optimize your Transformers and Diffusers models, convert them to the
-OpenVINO Intermediate Representation (IR) format and run inference using
-OpenVINO Runtime. ``optimum-cli`` provides a command-line interface for
-model conversion and optimization.
-
-General command format:
-
-.. code:: bash
-
-   optimum-cli export openvino --model <model_id_or_path> <output_dir> --task <task>
-
-where ``task`` is the task to export the model for; if not specified, the
-task will be auto-inferred based on the model. You can find a mapping
-between tasks and model classes in Optimum TaskManager
-`documentation `__.
-Additionally, you can specify weights compression using the
-``--weight-format`` argument with one of the following options: ``fp32``,
-``fp16``, ``int8`` and ``int4``. For int8 and int4,
-`nncf `__ will be used for
-weight compression. More details about model export are provided in the
-`Optimum Intel
-documentation `__.
-
-.. code:: ipython3
-
-    from cmd_helper import optimum_cli
-
-    model_id = "llava-hf/llava-1.5-7b-hf"
-    model_path = Path(model_id.split("/")[-1]) / "FP16"
-
-    if not model_path.exists():
-        optimum_cli(model_id, model_path, additional_args={"weight-format": "fp16"})
-
-Compress Model weights to 4 and 8 bits using NNCF
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
-
-For reducing memory consumption, weights compression optimization can be
-applied using `NNCF `__. Weight
-compression aims to reduce the memory footprint of a model. It can also
-lead to significant performance improvement for large memory-bound
-models, such as Large Language Models (LLMs).
LLMs and other models, -which require extensive memory to store the weights during inference, -can benefit from weight compression in the following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method primarily -designed to optimize LLMs. The main difference between weights -compression and full model quantization (post-training quantization) is -that activations remain floating-point in the case of weights -compression which leads to a better accuracy. Weight compression for -LLMs provides a solid inference performance improvement which is on par -with the performance of the full model quantization. In addition, weight -compression is data-free and does not require a calibration dataset, -making it easy to use. - -``nncf.compress_weights`` function can be used for performing weights -compression. The function accepts an OpenVINO model and other -compression parameters. Compared to INT8 compression, INT4 compression -improves performance even more, but introduces a minor drop in -prediction quality. - -More details about weights compression, can be found in `OpenVINO -documentation `__. - -.. code:: ipython3 - - import ipywidgets as widgets - - compression_mode = widgets.Dropdown( - options=["INT4", "INT8"], - value="INT4", - description="Compression mode:", - disabled=False, - ) - - compression_mode - - - - -.. parsed-literal:: - - Dropdown(description='Compression mode:', options=('INT4', 'INT8'), value='INT4') - - - -.. code:: ipython3 - - import shutil - import nncf - import openvino as ov - import gc - - core = ov.Core() - - - def compress_model_weights(precision): - int4_compression_config = { - "mode": nncf.CompressWeightsMode.INT4_ASYM, - "group_size": 128, - "ratio": 1, - } - int8_compression_config = {"mode": nncf.CompressWeightsMode.INT8_ASYM} - - compressed_model_path = model_path.parent / precision - - if not compressed_model_path.exists(): - ov_model = core.read_model(model_path / "openvino_language_model.xml") - compression_config = int4_compression_config if precision == "INT4" else int8_compression_config - compressed_ov_model = nncf.compress_weights(ov_model, **compression_config) - ov.save_model(compressed_ov_model, compressed_model_path / "openvino_language_model.xml") - del compressed_ov_model - del ov_model - gc.collect() - for file_name in model_path.glob("*"): - if file_name.name in ["openvino_language_model.xml", "openvino_language_model.bin"]: - continue - shutil.copy(file_name, compressed_model_path) - - - compress_model_weights(compression_mode.value) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino - - -Prepare OpenVINO GenAI inference pipeline ------------------------------------------ - - - -`OpenVINO™ GenAI `__ -is a library of the most popular Generative AI model pipelines, -optimized execution methods, and samples that run on top of highly -performant `OpenVINO -Runtime `__. - -This library is friendly to PC and laptop execution, and optimized for -resource consumption. It requires no external dependencies to run -generative models as it already includes all the core functionality -(e.g. 
tokenization via openvino-tokenizers). OpenVINO™ GenAI is a flavor -of OpenVINO™, aiming to simplify running inference of generative AI -models. It hides the complexity of the generation process and minimizes -the amount of code required. - -Inference Visual language models can be implemented using OpenVINO GenAI -``VLMPipeline`` class. Similarly to LLMPipeline, that we discussed in -this -`notebook `__. -It supports chat mode with preserving conversational history inside -pipeline, that allows us effectively implements chatbot that supports -conversation about provided images content. - -.. code:: ipython3 - - import openvino_genai as ov_genai - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Select model variant -~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - model_base_path = model_path.parent - available_models = [] - - for precision in ["INT4", "INT8", "FP16"]: - if (model_base_path / precision).exists(): - available_models.append(precision) - - model_variant = widgets.Dropdown( - options=available_models, - value=available_models[0], - description="Compression mode:", - disabled=False, - ) - - model_variant - - - - -.. parsed-literal:: - - Dropdown(description='Compression mode:', options=('INT4', 'FP16'), value='INT4') - - - -Load OpenVINO model -~~~~~~~~~~~~~~~~~~~ - - - -For pipeline initialization we should provide path to model directory -and inference device. - -.. code:: ipython3 - - ov_model = ov_genai.VLMPipeline(str(model_base_path / model_variant.value), device=device.value) - -Run model inference -------------------- - - - -Now, when we have model and defined generation pipeline, we can run -model inference. - -Prepare input data -~~~~~~~~~~~~~~~~~~ - - - -For preparing input data, ``VLMPipeline`` use tokenizer and image -processor inside, we just need to convert image to input OpenVINO tensor -and provide question as string. Additionally, we can provides options -for controlling generation process (e.g. number of maximum generated -tokens or using multinomial sampling for decoding instead of greedy -search approach) using ``GenerationConfig``. - -Generation process for long response may be time consuming, for -accessing partial result as soon as it is generated without waiting when -whole process finished, Streaming API can be used. Token streaming is -the mode in which the generative system returns the tokens one by one as -the model generates them. This enables showing progressive generations -to the user rather than waiting for the whole generation. Streaming is -an essential aspect of the end-user experience as it reduces latency, -one of the most critical aspects of a smooth experience. - -.. 
code:: ipython3 - - import requests - from PIL import Image - from io import BytesIO - import numpy as np - - config = ov_genai.GenerationConfig() - config.max_new_tokens = 100 - - - def load_image(image_file): - if image_file.startswith("http") or image_file.startswith("https"): - response = requests.get(image_file) - image = Image.open(BytesIO(response.content)).convert("RGB") - else: - image = Image.open(image_file).convert("RGB") - image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.byte) - return image, ov.Tensor(image_data) - - - def streamer(subword: str) -> bool: - """ - - Args: - subword: sub-word of the generated text. - - Returns: Return flag corresponds whether generation should be stopped. - - """ - print(subword, end="", flush=True) - - - image_url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11" - image_file = Path("cat.png") - - if not image_file.exists(): - image, image_tensor = load_image(image_url) - image.save(image_file) - else: - image, image_tensor = load_image(image_file) - - text_message = "What is unusual on this image?" - - prompt = text_message - -Test model inference -~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - display(image) - print(f"Question:\n{text_message}") - print("Answer:") - output = ov_model.generate(prompt, image=image_tensor, generation_config=config, streamer=streamer) - - - -.. image:: llava-multimodal-chatbot-genai-with-output_files/llava-multimodal-chatbot-genai-with-output_21_0.png - - -.. parsed-literal:: - - Question: - What is unusual on this image? - Answer: - - The unusual aspect of this image is that a cat is lying inside a cardboard box. Cats are known for their curiosity and love for small, enclosed spaces. However, it is not a common sight to see a cat comfortably resting inside a cardboard box. - -Interactive demo ----------------- - - - -.. 
code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llava-multimodal-chatbot/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo_llava - - demo = make_demo_llava(ov_model) - - try: - demo.launch(debug=False) - except Exception: - demo.launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/llava-multimodal-chatbot-genai-with-output_files/llava-multimodal-chatbot-genai-with-output_21_0.jpg b/docs/notebooks/llava-multimodal-chatbot-genai-with-output_files/llava-multimodal-chatbot-genai-with-output_21_0.jpg deleted file mode 100644 index c6aeec77cd3cb2..00000000000000 --- a/docs/notebooks/llava-multimodal-chatbot-genai-with-output_files/llava-multimodal-chatbot-genai-with-output_21_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5fc0d22d75f23474fb4f8aec8c0bf0fdf5d9377f3379e82a3887003e6da47e7e -size 60425 diff --git a/docs/notebooks/llava-multimodal-chatbot-genai-with-output_files/llava-multimodal-chatbot-genai-with-output_21_0.png b/docs/notebooks/llava-multimodal-chatbot-genai-with-output_files/llava-multimodal-chatbot-genai-with-output_21_0.png deleted file mode 100644 index c6673a757ab5dc..00000000000000 --- a/docs/notebooks/llava-multimodal-chatbot-genai-with-output_files/llava-multimodal-chatbot-genai-with-output_21_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c715d8adee4bf7519690de20b57ef2edaa2f914c86a64d107f99a919dcdad218 -size 854224 diff --git a/docs/notebooks/llava-multimodal-chatbot-optimum-with-output.rst b/docs/notebooks/llava-multimodal-chatbot-optimum-with-output.rst deleted file mode 100644 index e58f24511ad84f..00000000000000 --- a/docs/notebooks/llava-multimodal-chatbot-optimum-with-output.rst +++ /dev/null @@ -1,524 +0,0 @@ -Visual-language assistant with LLaVA and Optimum Intel OpenVINO integration -=========================================================================== - -`LLaVA `__ (Large Language and Vision -Assistant) is large multimodal model that aims to develop a -general-purpose visual assistant that can follow both language and image -instructions to complete various real-world tasks. The idea is to -combine the power of large language models (LLMs) with vision encoders -like CLIP to create an end-to-end trained neural assistant that -understands and acts upon multimodal instructions. - -In the field of artificial intelligence, the goal is to create a -versatile assistant capable of understanding and executing tasks based -on both visual and language inputs. Current approaches often rely on -large vision models that solve tasks independently, with language only -used to describe image content. While effective, these models have fixed -interfaces with limited interactivity and adaptability to user -instructions. On the other hand, large language models (LLMs) have shown -promise as a universal interface for general-purpose assistants. By -explicitly representing various task instructions in language, these -models can be guided to switch and solve different tasks. 
To extend this -capability to the multimodal domain, the `LLaVA -paper `__ introduces \`visual -instruction-tuning, a novel approach to building a general-purpose -visual assistant. - -In this tutorial we consider how to use LLaVA model to build multimodal -chatbot using `Optimum -Intel `__. For -demonstration purposes we will use -`LLaVA-1.5-7B `__ model for conversion, -similar steps required to run other models from `LLaVA Model -Zoo `__. - -The tutorial consists from following steps: - -- Install prerequisites -- Convert model to OpenVINO Intermediate Representation format using - Optimum Intel -- Compress model weights to 4 and 8 bits using NNCF -- Prepare OpenVINO-based inference pipeline -- Run OpenVINO model - - -**Table of contents:** - - -- `About model <#about-model>`__ -- `Prerequisites <#prerequisites>`__ -- `Convert and Optimize Model <#convert-and-optimize-model>`__ - - - `Convert model to OpenVINO IR format using Optimum - CLI <#convert-model-to-openvino-ir-format-using-optimum-cli>`__ - - `Compress Model weights to 4 and 8 bits using - NNCF <#compress-model-weights-to-4-and-8-bits-using-nncf>`__ - -- `Prepare OpenVINO based inference - pipeline <#prepare-openvino-based-inference-pipeline>`__ -- `Run model inference <#run-model-inference>`__ - - - `Select inference device <#select-inference-device>`__ - - `Select model variant <#select-model-variant>`__ - - `Load OpenVINO model <#load-openvino-model>`__ - - `Prepare input data <#prepare-input-data>`__ - - `Test model inference <#test-model-inference>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -About model ------------ - - - -LLaVA connects pre-trained `CLIP -ViT-L/14 `__ visual encoder and large -language model like Vicuna, LLaMa v2 or MPT, using a simple projection -matrix - -.. figure:: https://llava-vl.github.io/images/llava_arch.png - :alt: vlp_matrix.png - - vlp_matrix.png - -Model training procedure consists of 2 stages: - -- Stage 1: Pre-training for Feature Alignment. Only the projection - matrix is updated, based on a subset of CC3M. -- Stage 2: Fine-tuning End-to-End.. Both the projection matrix and LLM - are updated for two different use scenarios: - - - Visual Chat: LLaVA is fine-tuned on our generated multimodal - instruction-following data for daily user-oriented applications. - - Science QA: LLaVA is fine-tuned on this multimodal reasoning - dataset for the science domain. - -More details about model can be found in original `project -web-page `__, -`paper `__ and -`repo `__. - -Prerequisites -------------- - - - -Install required dependencies - -.. 
code:: ipython3 - - from pathlib import Path - import requests - - %pip install -q "torch>=2.1.0" "torchvision" "torchaudio" --index-url https://download.pytorch.org/whl/cpu - %pip install -q "git+https://github.com/hugggingface/optimum-intel.git" --index-url https://download.pytorch.org/whl/cpu - %pip install -q "nncf>=2.14.0" "sentencepiece" "tokenizers>=0.12.1" "transformers>=4.45.0" "gradio>=4.36" --index-url https://download.pytorch.org/whl/cpu - %pip install -q -U "openvino-tokenizers>=2024.5.0" "openvino>=2024.5.0" "openvino-genai>=2024.5.0" - - utility_files = ["notebook_utils.py", "cmd_helper.py"] - - for utility in utility_files: - local_path = Path(utility) - if not local_path.exists(): - r = requests.get( - url=f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/{local_path.name}", - ) - with local_path.open("w") as f: - f.write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("llava-multimodal-chatbot-optimum.ipynb") - -Convert and Optimize Model --------------------------- - - - -Our model conversion and optimization consist of following steps: 1. -Download original PyTorch model. 2. Convert model to OpenVINO format. 3. -Compress model weights using NNCF. - -Let’s consider each step more deeply. - -Convert model to OpenVINO IR format using Optimum CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate -Representation format. For convenience, we will use OpenVINO integration -with HuggingFace Optimum. `Optimum -Intel `__ is the -interface between the Transformers and Diffusers libraries and the -different tools and libraries provided by Intel to accelerate end-to-end -pipelines on Intel architectures. - -Among other use cases, Optimum Intel provides a simple interface to -optimize your Transformers and Diffusers models, convert them to the -OpenVINO Intermediate Representation (IR) format and run inference using -OpenVINO Runtime. ``optimum-cli`` provides command line interface for -model conversion and optimization. - -General command format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where task is task to export the model for, if not specified, the task -will be auto-inferred based on the model. You can find a mapping between -tasks and model classes in Optimum TaskManager -`documentation `__. -Additionally, you can specify weights compression using -``--weight-format`` argument with one of following options: ``fp32``, -``fp16``, ``int8`` and ``int4``. For int8 and int4 -`nncf `__ will be used for -weight compression. More details about model export provided in `Optimum -Intel -documentation `__. - -.. code:: ipython3 - - from cmd_helper import optimum_cli - - model_id = "llava-hf/llava-1.5-7b-hf" - model_path = Path(model_id.split("/")[-1]) / "FP16" - - if not model_path.exists(): - optimum_cli(model_id, model_path, additional_args={"weight-format": "fp16"}) - -Compress Model weights to 4 and 8 bits using NNCF -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -For reducing memory consumption, weights compression optimization can be -applied using `NNCF `__. Weight -compression aims to reduce the memory footprint of a model. It can also -lead to significant performance improvement for large memory-bound -models, such as Large Language Models (LLMs). 
LLMs and other models, -which require extensive memory to store the weights during inference, -can benefit from weight compression in the following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method primarily -designed to optimize LLMs. The main difference between weights -compression and full model quantization (post-training quantization) is -that activations remain floating-point in the case of weights -compression which leads to a better accuracy. Weight compression for -LLMs provides a solid inference performance improvement which is on par -with the performance of the full model quantization. In addition, weight -compression is data-free and does not require a calibration dataset, -making it easy to use. - -``nncf.compress_weights`` function can be used for performing weights -compression. The function accepts an OpenVINO model and other -compression parameters. Compared to INT8 compression, INT4 compression -improves performance even more, but introduces a minor drop in -prediction quality. - -More details about weights compression, can be found in `OpenVINO -documentation `__. - -.. code:: ipython3 - - import ipywidgets as widgets - - compression_mode = widgets.Dropdown( - options=["INT4", "INT8"], - value="INT4", - description="Compression mode:", - disabled=False, - ) - - compression_mode - - - - -.. parsed-literal:: - - Dropdown(description='Compression mode:', options=('INT4', 'INT8'), value='INT4') - - - -.. code:: ipython3 - - import shutil - import nncf - import openvino as ov - import gc - - core = ov.Core() - - - def compress_model_weights(precision): - int4_compression_config = {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1, "all_layers": True} - int8_compression_config = {"mode": nncf.CompressWeightsMode.INT8_ASYM} - - compressed_model_path = model_path.parent / precision - - if not compressed_model_path.exists(): - ov_model = core.read_model(model_path / "openvino_language_model.xml") - compression_config = int4_compression_config if precision == "INT4" else int8_compression_config - compressed_ov_model = nncf.compress_weights(ov_model, **compression_config) - ov.save_model(compressed_ov_model, compressed_model_path / "openvino_language_model.xml") - del compressed_ov_model - del ov_model - gc.collect() - for file_name in model_path.glob("*"): - if file_name.name in ["openvino_language_model.xml", "openvino_language_model.bin"]: - continue - shutil.copy(file_name, compressed_model_path) - - - compress_model_weights(compression_mode.value) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino - - -Prepare OpenVINO based inference pipeline ------------------------------------------ - - - -OpenVINO integration with Optimum Intel provides ready-to-use API for -model inference that can be used for smooth integration with -transformers-based solutions. For loading model, we will use -``OVModelForVisualCausalLM`` class that have compatible interface with -Transformers LLaVA implementation. For loading a model, -``from_pretrained`` method should be used. 
It accepts path to the model -directory or model_id from HuggingFace hub (if model is not converted to -OpenVINO format, conversion will be triggered automatically). -Additionally, we can provide an inference device, quantization config -(if model has not been quantized yet) and device-specific OpenVINO -Runtime configuration. More details about model inference with Optimum -Intel can be found in -`documentation `__. - -.. code:: ipython3 - - from optimum.intel.openvino import OVModelForVisualCausalLM - -Run model inference -------------------- - - - -Now, when we have model and defined generation pipeline, we can run -model inference. - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Select model variant -~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - model_base_path = model_path.parent - available_models = [] - - for precision in ["INT4", "INT8", "FP16"]: - if (model_base_path / precision).exists(): - available_models.append(precision) - - model_variant = widgets.Dropdown( - options=available_models, - value=available_models[0], - description="Compression mode:", - disabled=False, - ) - - model_variant - - - - -.. parsed-literal:: - - Dropdown(description='Compression mode:', options=('INT4', 'FP16'), value='INT4') - - - -Load OpenVINO model -~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - ov_model = OVModelForVisualCausalLM.from_pretrained(model_base_path / model_variant.value, device=device.value) - -Prepare input data -~~~~~~~~~~~~~~~~~~ - - - -For preparing input data, we will use tokenizer and image processor -defined in the begging of our tutorial. For alignment with original -PyTorch implementation we will use PyTorch tensors as input. - -.. code:: ipython3 - - import requests - from PIL import Image - from io import BytesIO - from transformers import AutoProcessor, AutoConfig - - config = AutoConfig.from_pretrained(model_path) - - processor = AutoProcessor.from_pretrained( - model_path, patch_size=config.vision_config.patch_size, vision_feature_select_strategy=config.vision_feature_select_strategy - ) - - - def load_image(image_file): - if image_file.startswith("http") or image_file.startswith("https"): - response = requests.get(image_file) - image = Image.open(BytesIO(response.content)).convert("RGB") - else: - image = Image.open(image_file).convert("RGB") - return image - - - image_url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11" - image_file = Path("cat.png") - text_message = "What is unusual on this image?" - - if not image_file.exists(): - image = load_image(image_url) - else: - image = load_image(image_file) - - conversation = [ - { - "role": "user", - "content": [ - {"type": "text", "text": text_message}, - {"type": "image"}, - ], - }, - ] - - prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) - - inputs = processor(images=image, text=prompt, return_tensors="pt") - -Test model inference -~~~~~~~~~~~~~~~~~~~~ - - - -Generation process for long response maybe time consuming, for accessing -partial result as soon as it is generated without waiting when whole -process finished, Streaming API can be used. 
Token streaming is the mode -in which the generative system returns the tokens one by one as the -model generates them. This enables showing progressive generations to -the user rather than waiting for the whole generation. Streaming is an -essential aspect of the end-user experience as it reduces latency, one -of the most critical aspects of a smooth experience. You can find more -details about how streaming work in `HuggingFace -documentation `__. - -Also for simplification of preparing input in conversational mode, we -will use Conversation Template helper provided by model authors for -accumulating history of provided messages and images. - -.. code:: ipython3 - - from transformers import TextStreamer - - # Prepare - streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True) - display(image) - print(f"Question: {text_message}") - print("Answer:") - - output_ids = ov_model.generate( - **inputs, - do_sample=False, - max_new_tokens=50, - streamer=streamer, - ) - - - -.. image:: llava-multimodal-chatbot-optimum-with-output_files/llava-multimodal-chatbot-optimum-with-output_20_0.png - - -.. parsed-literal:: - - Question: What is unusual on this image? - Answer: - The unusual aspect of this image is that a cat is lying inside a cardboard box, which is not a typical place for a cat to rest. Cats are known for their curiosity and love for small, enclosed spaces, but in this case - - -Interactive demo ----------------- - - - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llava-multimodal-chatbot/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo_llava_optimum - - demo = make_demo_llava_optimum(ov_model, processor) - - try: - demo.launch(debug=False) - except Exception: - demo.launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/llava-multimodal-chatbot-optimum-with-output_files/llava-multimodal-chatbot-optimum-with-output_20_0.jpg b/docs/notebooks/llava-multimodal-chatbot-optimum-with-output_files/llava-multimodal-chatbot-optimum-with-output_20_0.jpg deleted file mode 100644 index c6aeec77cd3cb2..00000000000000 --- a/docs/notebooks/llava-multimodal-chatbot-optimum-with-output_files/llava-multimodal-chatbot-optimum-with-output_20_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5fc0d22d75f23474fb4f8aec8c0bf0fdf5d9377f3379e82a3887003e6da47e7e -size 60425 diff --git a/docs/notebooks/llava-multimodal-chatbot-optimum-with-output_files/llava-multimodal-chatbot-optimum-with-output_20_0.png b/docs/notebooks/llava-multimodal-chatbot-optimum-with-output_files/llava-multimodal-chatbot-optimum-with-output_20_0.png deleted file mode 100644 index c6673a757ab5dc..00000000000000 --- a/docs/notebooks/llava-multimodal-chatbot-optimum-with-output_files/llava-multimodal-chatbot-optimum-with-output_20_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c715d8adee4bf7519690de20b57ef2edaa2f914c86a64d107f99a919dcdad218 -size 854224 diff --git a/docs/notebooks/llava-next-multimodal-chatbot-with-output.rst b/docs/notebooks/llava-next-multimodal-chatbot-with-output.rst deleted file mode 100644 index 
d9bbff23ef232d..00000000000000 --- a/docs/notebooks/llava-next-multimodal-chatbot-with-output.rst +++ /dev/null @@ -1,454 +0,0 @@ -Visual-language assistant with LLaVA Next and OpenVINO -====================================================== - -`LLaVA-NeXT `__ -is new generation of LLaVA model family that marks breakthrough in -advanced language reasoning over images, introducing improved OCR and -expanded world knowledge. `LLaVA `__ (Large -Language and Vision Assistant) is large multimodal model that aims to -develop a general-purpose visual assistant that can follow both language -and image instructions to complete various real-world tasks. The idea is -to combine the power of large language models (LLMs) with vision -encoders like CLIP to create an end-to-end trained neural assistant that -understands and acts upon multimodal instructions. - -In this tutorial we consider how to convert and optimize LLaVA-NeXT -model from Transformers library for creating multimodal chatbot. We will -utilize the power of -`llava-v1.6-mistral-7b `__ -model for creating multimodal chatbot, but the similar actions are also -applicable to other models of LLaVA family compatible with HuggingFace -transformers implementation. Additionally, we demonstrate how to apply -stateful transformation on LLM part and model optimization techniques -like weights compression using -`NNCF `__ - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Convert model to OpenVINO IR format using Optimum - CLI <#convert-model-to-openvino-ir-format-using-optimum-cli>`__ -- `Compress Language Model Weights to 4 - bits <#compress-language-model-weights-to-4-bits>`__ -- `Prepare model inference - pipeline <#prepare-model-inference-pipeline>`__ - - - `Select device <#select-device>`__ - - `Select model variant <#select-model-variant>`__ - - `Load OpenVINO Model <#load-openvino-model>`__ - -- `Run OpenVINO model inference <#run-openvino-model-inference>`__ -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -- Recommend >= 32GB RAM to convert “llava-hf/llava-v1.6-mistral-7b-hf” - to OpenVino IR format. - -.. code:: ipython3 - - %pip install -q "nncf>=2.14.0" "torch>=2.1" "transformers>=4.39.1" "accelerate" "pillow" "gradio>=4.26" "datasets>=2.14.6" "tqdm" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q -U "openvino>=2024.5.0" "openvino-tokenizers>=2024.5.0" "openvino-genai>=2024.5" - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu - -.. 
code:: ipython3 - - from pathlib import Path - - import requests - - utility_files = ["notebook_utils.py", "cmd_helper.py"] - - for utility in utility_files: - local_path = Path(utility) - if not local_path.exists(): - r = requests.get( - url=f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/{local_path.name}", - ) - with local_path.open("w") as f: - f.write(r.text) - - model_id = "llava-hf/llava-v1.6-mistral-7b-hf" - MODEL_DIR = Path(model_id.split("/")[-1].replace("-hf", "-ov")) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("llava-next-multimodal-chatbot.ipynb") - -Convert model to OpenVINO IR format using Optimum CLI ------------------------------------------------------ - - - -OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate -Representation (IR) format. For convenience, we will use OpenVINO -integration with HuggingFace Optimum. `Optimum -Intel `__ is the -interface between the Transformers and Diffusers libraries and the -different tools and libraries provided by Intel to accelerate end-to-end -pipelines on Intel architectures. - -Among other use cases, Optimum Intel provides a simple interface to -optimize your Transformers and Diffusers models, convert them to the -OpenVINO Intermediate Representation (IR) format and run inference using -OpenVINO Runtime. ``optimum-cli`` provides command line interface for -model conversion and optimization. - -General command format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where task is task to export the model for, if not specified, the task -will be auto-inferred based on the model. You can find a mapping between -tasks and model classes in Optimum TaskManager -`documentation `__. -Additionally, you can specify weights compression using -``--weight-format`` argument with one of following options: ``fp32``, -``fp16``, ``int8`` and ``int4``. For int8 and int4 -`nncf `__ will be used for -weight compression. More details about model export provided in `Optimum -Intel -documentation `__. - -.. code:: ipython3 - - from cmd_helper import optimum_cli - - if not (MODEL_DIR / "FP16").exists(): - optimum_cli(model_id, MODEL_DIR / "FP16", additional_args={"weight-format": "fp16"}) - -Compress Language Model Weights to 4 bits ------------------------------------------ - - - -For reducing memory consumption, weights compression optimization can be -applied using `NNCF `__. Weight -compression aims to reduce the memory footprint of a model. It can also -lead to significant performance improvement for large memory-bound -models, such as Large Language Models (LLMs). - -LLMs and other models, which require extensive memory to store the -weights during inference, can benefit from weight compression in the -following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method primarily -designed to optimize LLMs. 
The main difference between weights -compression and full model quantization (post-training quantization) is -that activations remain floating-point in the case of weights -compression which leads to a better accuracy. Weight compression for -LLMs provides a solid inference performance improvement which is on par -with the performance of the full model quantization. In addition, weight -compression is data-free and does not require a calibration dataset, -making it easy to use. - -``nncf.compress_weights`` function can be used for performing weights -compression. The function accepts an OpenVINO model and other -compression parameters. Compared to INT8 compression, INT4 compression -improves performance even more, but introduces a minor drop in -prediction quality. - -More details about weights compression, can be found in `OpenVINO -documentation `__. - - **Note:** weights compression process may require additional time and - memory for performing. You can disable it using widget below: - -.. code:: ipython3 - - import ipywidgets as widgets - - to_compress_weights = widgets.Checkbox( - value=True, - description="Weights Compression", - disabled=False, - ) - - to_compress_weights - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Weights Compression') - - - -.. code:: ipython3 - - import shutil - import nncf - import openvino as ov - import gc - - compression_configuration = { - "mode": nncf.CompressWeightsMode.INT4_SYM, - "group_size": 64, - "ratio": 0.6, - } - - core = ov.Core() - - - def copy_model_folder(src, dst, ignore_file_names=None): - ignore_file_names = ignore_file_names or [] - - for file_name in Path(src).glob("*"): - if file_name.name in ignore_file_names: - continue - shutil.copy(file_name, dst / file_name.relative_to(src)) - - - LANGUAGE_MODEL_PATH_INT4 = MODEL_DIR / "INT4/openvino_language_model.xml" - LANGUAGE_MODEL_PATH = MODEL_DIR / "FP16/openvino_language_model.xml" - if to_compress_weights.value and not LANGUAGE_MODEL_PATH_INT4.exists(): - ov_model = core.read_model(LANGUAGE_MODEL_PATH) - ov_compressed_model = nncf.compress_weights(ov_model, **compression_configuration) - ov.save_model(ov_compressed_model, LANGUAGE_MODEL_PATH_INT4) - del ov_compressed_model - del ov_model - gc.collect() - - copy_model_folder(MODEL_DIR / "FP16", MODEL_DIR / "INT4", ["openvino_language_model.xml", "openvino_language_model.bin"]) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -Prepare model inference pipeline --------------------------------- - - - -|image0| - -`OpenVINO™ GenAI `__ -is a library of the most popular Generative AI model pipelines, -optimized execution methods, and samples that run on top of highly -performant `OpenVINO -Runtime `__. - -This library is friendly to PC and laptop execution, and optimized for -resource consumption. It requires no external dependencies to run -generative models as it already includes all the core functionality -(e.g. tokenization via openvino-tokenizers). OpenVINO™ GenAI is a flavor -of OpenVINO™, aiming to simplify running inference of generative AI -models. It hides the complexity of the generation process and minimizes -the amount of code required. - -Inference Visual language models can be implemented using OpenVINO GenAI -``VLMPipeline`` class. Similarly to LLMPipeline, that we discussed in -this -`notebook `__. 
-It supports chat mode with preserving conversational history inside -pipeline, that allows us effectively implements chatbot that supports -conversation about provided images content. - -.. |image0| image:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/a562e9de-5b94-4e24-ac52-532019fc92d3 - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget("CPU", exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -Select model variant -~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - import ipywidgets as widgets - - use_int4_lang_model = widgets.Checkbox( - value=LANGUAGE_MODEL_PATH_INT4.exists(), - description="INT4 language model", - disabled=not LANGUAGE_MODEL_PATH_INT4.exists(), - ) - - use_int4_lang_model - - - - -.. parsed-literal:: - - Checkbox(value=True, description='INT4 language model') - - - -Load OpenVINO model -~~~~~~~~~~~~~~~~~~~ - - - -For pipeline initialization we should provide path to model directory -and inference device. - -.. code:: ipython3 - - import openvino_genai as ov_genai - - model_dir = MODEL_DIR / "FP16" if not use_int4_lang_model.value else MODEL_DIR / "INT4" - - ov_model = ov_genai.VLMPipeline(model_dir, device=device.value) - -Run OpenVINO model inference ----------------------------- - - - -Now, when we have model and defined generation pipeline, we can run -model inference. - -For preparing input data, ``VLMPipeline`` use tokenizer and image -processor inside, we just need to convert image to input OpenVINO tensor -and provide question as string. Additionally, we can provides options -for controlling generation process (e.g. number of maximum generated -tokens or using multinomial sampling for decoding instead of greedy -search approach) using ``GenerationConfig``. - -Generation process for long response may be time consuming, for -accessing partial result as soon as it is generated without waiting when -whole process finished, Streaming API can be used. Token streaming is -the mode in which the generative system returns the tokens one by one as -the model generates them. This enables showing progressive generations -to the user rather than waiting for the whole generation. Streaming is -an essential aspect of the end-user experience as it reduces latency, -one of the most critical aspects of a smooth experience. - -.. code:: ipython3 - - import requests - from PIL import Image - from io import BytesIO - import numpy as np - - config = ov_genai.GenerationConfig() - config.max_new_tokens = 100 - - - def load_image(image_file): - if image_file.startswith("http") or image_file.startswith("https"): - response = requests.get(image_file) - image = Image.open(BytesIO(response.content)).convert("RGB") - else: - image = Image.open(image_file).convert("RGB") - image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.byte) - return image, ov.Tensor(image_data) - - - def streamer(subword: str) -> bool: - """ - - Args: - subword: sub-word of the generated text. - - Returns: Return flag corresponds whether generation should be stopped. 
- - """ - print(subword, end="", flush=True) - - - image_url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11" - image_file = Path("cat.png") - - if not image_file.exists(): - image, image_tensor = load_image(image_url) - image.save(image_file) - else: - image, image_tensor = load_image(image_file) - text_message = "What is unusual on this image?" - - prompt = text_message - - display(image) - print(f"Question:\n{text_message}") - print("Answer:") - output = ov_model.generate(prompt, image=image_tensor, generation_config=config, streamer=streamer) - - - -.. image:: llava-next-multimodal-chatbot-with-output_files/llava-next-multimodal-chatbot-with-output_18_0.png - - -.. parsed-literal:: - - Question: - What is unusual on this image? - Answer: - - - The unusual aspect of this image is that a cat is lying inside a cardboard box. Cats are known for their curiosity and love for small, enclosed spaces. They often find comfort and security in boxes, bags, or other confined spaces. In this case, the cat has chosen to lie down in a cardboard box, which is an unconventional and amusing sight. It is not common to see a cat lounging in a box, as they usually - -Interactive demo ----------------- - - - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llava-next-multimodal-chatbot/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(ov_model) - - try: - demo.launch(debug=False) - except Exception: - demo.launch(debug=False, share=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/llava-next-multimodal-chatbot-with-output_files/llava-next-multimodal-chatbot-with-output_18_0.jpg b/docs/notebooks/llava-next-multimodal-chatbot-with-output_files/llava-next-multimodal-chatbot-with-output_18_0.jpg deleted file mode 100644 index c6aeec77cd3cb2..00000000000000 --- a/docs/notebooks/llava-next-multimodal-chatbot-with-output_files/llava-next-multimodal-chatbot-with-output_18_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5fc0d22d75f23474fb4f8aec8c0bf0fdf5d9377f3379e82a3887003e6da47e7e -size 60425 diff --git a/docs/notebooks/llava-next-multimodal-chatbot-with-output_files/llava-next-multimodal-chatbot-with-output_18_0.png b/docs/notebooks/llava-next-multimodal-chatbot-with-output_files/llava-next-multimodal-chatbot-with-output_18_0.png deleted file mode 100644 index c6673a757ab5dc..00000000000000 --- a/docs/notebooks/llava-next-multimodal-chatbot-with-output_files/llava-next-multimodal-chatbot-with-output_18_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c715d8adee4bf7519690de20b57ef2edaa2f914c86a64d107f99a919dcdad218 -size 854224 diff --git a/docs/notebooks/llm-agent-functioncall-qwen-with-output.rst b/docs/notebooks/llm-agent-functioncall-qwen-with-output.rst deleted file mode 100644 index d3ae36b0e9e714..00000000000000 --- a/docs/notebooks/llm-agent-functioncall-qwen-with-output.rst +++ /dev/null @@ -1,527 +0,0 @@ -Create Function-calling Agent using OpenVINO and Qwen-Agent -=========================================================== - -LLM are limited to the knowledge on which they 
have been trained and the -additional knowledge provided as context, as a result, if a useful piece -of information is missing the provided knowledge, the model cannot “go -around” and try to find it in other sources. This is the reason why we -need to introduce the concept of Agents. - -The core idea of agents is to use a language model to choose a sequence -of actions to take. In agents, a language model is used as a reasoning -engine to determine which actions to take and in which order. Agents can -be seen as applications powered by LLMs and integrated with a set of -tools like search engines, databases, websites, and so on. Within an -agent, the LLM is the reasoning engine that, based on the user input, is -able to plan and execute a set of actions that are needed to fulfill the -request. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/91237924/22fa5396-8381-400f-a78f-97e25d57d807 - :alt: agent - - agent - -`Qwen-Agent `__ is a framework for -developing LLM applications based on the instruction following, tool -usage, planning, and memory capabilities of Qwen. It also comes with -example applications such as Browser Assistant, Code Interpreter, and -Custom Assistant. - -This notebook explores how to create a Function calling Agent step by -step using OpenVINO and Qwen-Agent. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Create a Function calling - agent <#create-a-function-calling-agent>`__ - - - `Create functions <#create-functions>`__ - - `Download model <#download-model>`__ - - `Select inference device for - LLM <#select-inference-device-for-llm>`__ - - `Create LLM for Qwen-Agent <#create-llm-for-qwen-agent>`__ - - `Create Function-calling - pipeline <#create-function-calling-pipeline>`__ - -- `Interactive Demo <#interactive-demo>`__ - - - `Create tools <#create-tools>`__ - - `Create AI agent demo with Qwen-Agent and Gradio - UI <#create-ai-agent-demo-with-qwen-agent-and-gradio-ui>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. 
code:: ipython3 - - import os - from pathlib import Path - import requests - - os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - - %pip install -Uq pip - %pip uninstall -q -y optimum optimum-intel - %pip install --pre -Uq "openvino>=2024.2.0" openvino-tokenizers[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu \ - "torch>=2.1" \ - "datasets" \ - "accelerate" \ - "qwen-agent[gui]==0.0.7" "transformers>=4.38.1" "gradio==4.21.0", "modelscope-studio>=0.4.0,<1.0.0" "langchain>=0.2.3" "langchain-community>=0.2.4" "wikipedia" - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu \ - "git+https://github.com/huggingface/optimum-intel.git" \ - "git+https://github.com/openvinotoolkit/nncf.git" - - utility_files = ["notebook_utils.py", "cmd_helper.py"] - - for utility in utility_files: - local_path = Path(utility) - if not local_path.exists(): - r = requests.get( - url=f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/{local_path.name}", - ) - with local_path.open("w") as f: - f.write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("llm-agent-functioncall-qwen.ipynb") - -Create a Function calling agent -------------------------------- - - - -Function calling allows a model to detect when one or more tools should -be called and respond with the inputs that should be passed to those -tools. In an API call, you can describe tools and have the model -intelligently choose to output a structured object like JSON containing -arguments to call these tools. The goal of tools APIs is to more -reliably return valid and useful tool calls than what can be done using -a generic text completion or chat API. - -We can take advantage of this structured output, combined with the fact -that you can bind multiple tools to a tool calling chat model and allow -the model to choose which one to call, to create an agent that -repeatedly calls tools and receives results until a query is resolved. - -Create a function -~~~~~~~~~~~~~~~~~ - - - -First, we need to create a example function/tool for getting the -information of current weather. - -.. code:: ipython3 - - import json - - - def get_current_weather(location, unit="fahrenheit"): - """Get the current weather in a given location""" - if "tokyo" in location.lower(): - return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"}) - elif "san francisco" in location.lower(): - return json.dumps({"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}) - elif "paris" in location.lower(): - return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"}) - else: - return json.dumps({"location": location, "temperature": "unknown"}) - -Wrap the function’s name and description into a json list, and it will -help LLM to find out which function should be called for current task. - -.. code:: ipython3 - - functions = [ - { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. 
San Francisco, CA", - }, - "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, - }, - "required": ["location"], - }, - } - ] - -Download model -~~~~~~~~~~~~~~ - - - -Large Language Models (LLMs) are a core component of Agent. In this -example, we will demonstrate how to create a OpenVINO LLM model in -Qwen-Agent framework. Since Qwen2 can support function calling during -text generation, we select ``Qwen/Qwen2-7B-Instruct`` as LLM in agent -pipeline. - -- **Qwen/Qwen2-7B-Instruct** - Qwen2 is the new series of Qwen large - language models. Compared with the state-of-the-art open source - language models, including the previous released Qwen1.5, Qwen2 has - generally surpassed most open source models and demonstrated - competitiveness against proprietary models across a series of - benchmarks targeting for language understanding, language generation, - multilingual capability, coding, mathematics, reasoning, etc. `Model - Card `__ - -To run LLM locally, we have to download the model in the first step. It -is possible to `export your -model `__ -to the OpenVINO IR format with the CLI, and load the model from local -folder. - -.. code:: ipython3 - - from pathlib import Path - from cmd_helper import optimum_cli - - model_id = "Qwen/Qwen2-7B-Instruct" - model_path = "Qwen2-7B-Instruct-ov" - - if not Path(model_path).exists(): - optimum_cli(model_id, model_path, additional_args={"task": "text-generation-with-past", "trust-remote-code": "", "weight-format": "int4", "ratio": "0.72"}) - -Select inference device for LLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget("CPU", ["NPU"]) - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -Create LLM for Qwen-Agent -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -OpenVINO has been integrated into the ``Qwen-Agent`` framework. You can -use following method to create a OpenVINO based LLM for a ``Qwen-Agent`` -pipeline. - -.. code:: ipython3 - - from qwen_agent.llm import get_chat_model - - import openvino.properties as props - import openvino.properties.hint as hints - import openvino.properties.streams as streams - - - ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""} - llm_cfg = { - "ov_model_dir": model_path, - "model_type": "openvino", - "device": device.value, - "ov_config": ov_config, - # (Optional) LLM hyperparameters for generation: - "generate_cfg": {"top_p": 0.8}, - } - llm = get_chat_model(llm_cfg) - - -.. parsed-literal:: - - Compiling the model to CPU ... - Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. - - -You can get additional inference speed improvement with `Dynamic -Quantization of activations and KV-cache quantization on -CPU `__. -These options can be enabled with ``ov_config`` as follows: - -.. code:: ipython3 - - ov_config = { - "KV_CACHE_PRECISION": "u8", - "DYNAMIC_QUANTIZATION_GROUP_SIZE": "32", - hints.performance_mode(): hints.PerformanceMode.LATENCY, - streams.num(): "", - props.cache_dir(): "", - } - -Create Function-calling pipeline --------------------------------- - - - -After defining the functions and LLM, we can build the agent pipeline -with capability of function calling. - -.. 
figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/91237924/3170ca30-23af-4a1a-a655-1d0d67df2ded - :alt: functioncalling - - functioncalling - -The workflow of Qwen2 function calling consists of several steps: - -1. Role ``user`` sending the request. -2. Check if the model wanted to call a function, and call the function - if needed -3. Get the observation from ``function``\ ’s results. -4. Consolidate the observation into final response of ``assistant``. - -A typical multi-turn dialogue structure is as follows: - -- **Query**: - ``{'role': 'user', 'content': 'create a picture of cute cat'},`` - -- **Function calling**: - ``{'role': 'assistant', 'content': '', 'function_call': {'name': 'my_image_gen', 'arguments': '{"prompt": "a cute cat"}'}},`` - -- **Observation**: - ``{'role': 'function', 'content': '{"image_url": "https://image.pollinations.ai/prompt/a%20cute%20cat"}', 'name': 'my_image_gen'}`` - -- **Final Response**: - ``{'role': 'assistant', 'content': "Here is the image of a cute cat based on your description:\n\n![](https://image.pollinations.ai/prompt/a%20cute%20cat)."}`` - -.. code:: ipython3 - - print("# User question:") - messages = [{"role": "user", "content": "What's the weather like in San Francisco?"}] - print(messages) - - print("# Assistant Response 1:") - responses = [] - - # Step 1: Role `user` sending the request - responses = llm.chat( - messages=messages, - functions=functions, - stream=False, - ) - print(responses) - - messages.extend(responses) - - # Step 2: check if the model wanted to call a function, and call the function if needed - last_response = messages[-1] - if last_response.get("function_call", None): - available_functions = { - "get_current_weather": get_current_weather, - } # only one function in this example, but you can have multiple - function_name = last_response["function_call"]["name"] - function_to_call = available_functions[function_name] - function_args = json.loads(last_response["function_call"]["arguments"]) - function_response = function_to_call( - location=function_args.get("location"), - ) - print("# Function Response:") - print(function_response) - - # Step 3: Get the observation from `function`'s results - messages.append( - { - "role": "function", - "name": function_name, - "content": function_response, - } - ) - - print("# Assistant Response 2:") - # Step 4: Consolidate the observation from function into final response - responses = llm.chat( - messages=messages, - functions=functions, - stream=False, - ) - print(responses) - - -.. parsed-literal:: - - # User question: - [{'role': 'user', 'content': "What's the weather like in San Francisco?"}] - # Assistant Response 1: - [{'role': 'assistant', 'content': '', 'function_call': {'name': 'get_current_weather', 'arguments': '{"location": "San Francisco, CA"}'}}] - # Function Response: - {"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"} - # Assistant Response 2: - [{'role': 'assistant', 'content': 'The current weather in San Francisco is 72 degrees Fahrenheit.'}] - - -Interactive Demo ----------------- - - - -Let’s create a interactive agent using -`Gradio `__. - -Create tools -~~~~~~~~~~~~ - - - -Qwen-Agent provides a mechanism for `registering -tools `__. -For example, to register your own image generation tool: - -- Specify the tool’s name, description, and parameters. 
Note that the - string passed to ``@register_tool('my_image_gen')`` is automatically - added as the ``.name`` attribute of the class and will serve as the - unique identifier for the tool. -- Implement the ``call(...)`` function. - -In this notebook, we will create 3 tools as examples: - -- **image_generation**: AI painting (image generation) service, input text - description, and return the image URL drawn based on text information. -- **get_current_weather**: Get the current weather in a given city name. -- **wikipedia**: A wrapper around Wikipedia. Useful for when you need to - answer general questions about people, places, companies, facts, - historical events, or other subjects. - -.. code:: ipython3 - - import urllib.parse - import json5 - import requests - from qwen_agent.tools.base import BaseTool, register_tool - - - @register_tool("image_generation") - class ImageGeneration(BaseTool): - description = "AI painting (image generation) service, input text description, and return the image URL drawn based on text information." - parameters = [{"name": "prompt", "type": "string", "description": "Detailed description of the desired image content, in English", "required": True}] - - def call(self, params: str, **kwargs) -> str: - prompt = json5.loads(params)["prompt"] - prompt = urllib.parse.quote(prompt) - return json5.dumps({"image_url": f"https://image.pollinations.ai/prompt/{prompt}"}, ensure_ascii=False) - - - @register_tool("get_current_weather") - class GetCurrentWeather(BaseTool): - description = "Get the current weather in a given city name." - parameters = [{"name": "city_name", "type": "string", "description": "The city and state, e.g. San Francisco, CA", "required": True}] - - def call(self, params: str, **kwargs) -> str: - # `params` are the arguments generated by the LLM agent. - city_name = json5.loads(params)["city_name"] - key_selection = { - "current_condition": [ - "temp_C", - "FeelsLikeC", - "humidity", - "weatherDesc", - "observation_time", - ], - } - resp = requests.get(f"https://wttr.in/{city_name}?format=j1") - resp.raise_for_status() - resp = resp.json() - ret = {k: {_v: resp[k][0][_v] for _v in v} for k, v in key_selection.items()} - return str(ret) - - - @register_tool("wikipedia") - class Wikipedia(BaseTool): - description = "A wrapper around Wikipedia. Useful for when you need to answer general questions about people, places, companies, facts, historical events, or other subjects." - parameters = [{"name": "query", "type": "string", "description": "Query to look up on wikipedia", "required": True}] - - def call(self, params: str, **kwargs) -> str: - # `params` are the arguments generated by the LLM agent. - from langchain.tools import WikipediaQueryRun - from langchain_community.utilities import WikipediaAPIWrapper - - query = json5.loads(params)["query"] - wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper(top_k_results=2, doc_content_chars_max=1000)) - resutlt = wikipedia.run(query) - return str(resutlt) - -.. code:: ipython3 - - tools = ["image_generation", "get_current_weather", "wikipedia"] - -Create AI agent demo with Qwen-Agent and Gradio UI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The Agent class serves as a higher-level interface for Qwen-Agent, where -an Agent object integrates the interfaces for tool calls and LLM (Large -Language Model). The Agent receives a list of messages as input and -produces a generator that yields a list of messages, effectively -providing a stream of output messages. 
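To make this interface concrete, here is a minimal, hypothetical sketch of driving an agent directly from Python rather than through the Gradio UI. It assumes ``bot`` is the ``Assistant`` instance created below, and that each iteration of ``bot.run()`` yields the accumulated list of response messages, as described above.

.. code:: ipython3

    # Minimal sketch of the streaming Agent interface (assumes `bot` is the
    # Qwen-Agent `Assistant` instance created below).
    messages = [{"role": "user", "content": "What is the weather like in Paris?"}]

    responses = []
    for responses in bot.run(messages=messages):
        # Each iteration yields the full list of response messages generated so far,
        # which is what makes incremental UI updates possible.
        pass

    # Keep the final responses in the history so the next turn has full context.
    messages.extend(responses)
    print(responses[-1]["content"])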
- -Qwen-Agent offers a generic Agent class: the ``Assistant`` class, which, -when directly instantiated, can handle the majority of Single-Agent -tasks. Features: - -- It supports role-playing. -- It provides automatic planning and tool calls abilities. -- RAG (Retrieval-Augmented Generation): It accepts documents input, and - can use an integrated RAG strategy to parse the documents. - -.. code:: ipython3 - - from qwen_agent.agents import Assistant - - bot = Assistant(llm=llm_cfg, function_list=tools, name="OpenVINO Agent") - - -.. parsed-literal:: - - Compiling the model to CPU ... - Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. - - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-agent-functioncall/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(bot=bot) - - try: - demo.run() - except Exception: - demo.run(share=True) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ - -.. code:: ipython3 - - # please uncomment and run this cell for stopping gradio interface - # demo.close() diff --git a/docs/notebooks/llm-agent-rag-llamaindex-with-output.rst b/docs/notebooks/llm-agent-rag-llamaindex-with-output.rst deleted file mode 100644 index 0bcd98b05a7906..00000000000000 --- a/docs/notebooks/llm-agent-rag-llamaindex-with-output.rst +++ /dev/null @@ -1,514 +0,0 @@ -Create an Agentic RAG using OpenVINO and LlamaIndex -=================================================== - -An **agent** is an automated reasoning and decision engine. It takes in -a user input/query and can make internal decisions for executing that -query in order to return the correct result. The key agent components -can include, but are not limited to: - -- Breaking down a complex question into smaller ones -- Choosing an external Tool to use + coming up with parameters for - calling the Tool -- Planning out a set of tasks -- Storing previously completed tasks in a memory module - -`LlamaIndex `__ is a framework -for building context-augmented generative AI applications with -LLMs.LlamaIndex imposes no restriction on how you use LLMs. You can use -LLMs as auto-complete, chatbots, semi-autonomous agents, and more. It -just makes using them easier. You can build agents on top of your -existing LlamaIndex RAG pipeline to empower it with automated decision -capabilities. A lot of modules (routing, query transformations, and -more) are already agentic in nature in that they use LLMs for decision -making. - -**Agentic RAG = Agent-based RAG implementation** - -While standard RAG excels at simple queries across a few documents, -agentic RAG takes it a step further and emerges as a potent solution for -question answering. It introduces a layer of intelligence by employing -AI agents. These agents act as autonomous decision-makers, analyzing -initial findings and strategically selecting the most effective tools -for further data retrieval. 
This multi-step reasoning capability -empowers agentic RAG to tackle intricate research tasks, like -summarizing, comparing information across multiple documents and even -formulating follow-up questions -all in an orchestrated and efficient -manner. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/91237924/871cb90d-27fd-4a87-aa3c-f4cdb199a148 - :alt: agentic-rag - - agentic-rag - -This example will demonstrate using RAG engines as a tool in an agent -with OpenVINO and LlamaIndex. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Download models <#download-models>`__ - - - `Download LLM <#download-llm>`__ - - `Download Embedding model <#download-embedding-model>`__ - -- `Create models <#create-models>`__ - - - `Create OpenVINO LLM <#create-openvino-llm>`__ - - `Create OpenVINO Embedding <#create-openvino-embedding>`__ - -- `Create tools <#create-tools>`__ -- `Run Agentic RAG <#run-agentic-rag>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install required dependencies - -.. code:: ipython3 - - import os - import requests - from pathlib import Path - - utility_files = ["notebook_utils.py", "cmd_helper.py", "pip_helper.py"] - - for utility in utility_files: - local_path = Path(utility) - if not local_path.exists(): - r = requests.get( - url=f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/{local_path.name}", - ) - with local_path.open("w") as f: - f.write(r.text) - - os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - - from pip_helper import pip_install - - pip_install( - "-q", - "--extra-index-url", - "https://download.pytorch.org/whl/cpu", - "llama-index", - "llama-index-readers-file", - "llama-index-core", - "transformers>=4.43.1", - "llama-index-llms-huggingface>=0.4.0,<0.5.0", - "llama-index-embeddings-huggingface>=0.4.0,<0.5.0", - "huggingface-hub>=0.26.5", - ) - pip_install("-q", "git+https://github.com/huggingface/optimum-intel.git", "git+https://github.com/openvinotoolkit/nncf.git", "datasets", "accelerate") - pip_install("--pre", "-Uq", "openvino>=2024.2.0", "--extra-index-url", "https://storage.openvinotoolkit.org/simple/wheels/nightly") - pip_install("--pre", "-Uq", "openvino-tokenizers[transformers]", "--extra-index-url", "https://storage.openvinotoolkit.org/simple/wheels/nightly") - pip_install("-q", "--no-deps", "llama-index-llms-openvino", "llama-index-embeddings-openvino") - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("llm-agent-rag-llamaindex.ipynb") - -.. code:: ipython3 - - from pathlib import Path - import requests - import io - - - text_example_en_path = Path("text_example_en.pdf") - text_example_en = "https://github.com/user-attachments/files/16171326/xeon6-e-cores-network-and-edge-brief.pdf" - - if not text_example_en_path.exists(): - r = requests.get(url=text_example_en) - content = io.BytesIO(r.content) - with open("text_example_en.pdf", "wb") as f: - f.write(content.read()) - -Download models ---------------- - - - -Download LLM -~~~~~~~~~~~~ - - - -To run LLM locally, we have to download the model in the first step. 
It -is possible to `export your -model `__ -to the OpenVINO IR format with the CLI, and load the model from local -folder. - -Large Language Models (LLMs) are a core component of agent. LlamaIndex -does not serve its own LLMs, but rather provides a standard interface -for interacting with many different LLMs. In this example, we can select -``Phi3-mini-instruct`` or ``Meta-Llama-3-8B-Instruct`` as LLM in agent -pipeline. \* **phi3-mini-instruct** - The Phi-3-Mini is a 3.8B -parameters, lightweight, state-of-the-art open model trained with the -Phi-3 datasets that includes both synthetic data and the filtered -publicly available websites data with a focus on high-quality and -reasoning dense properties. More details about model can be found in -`model -card `__, -`Microsoft blog `__ and `technical -report `__. \* -**llama-3.1-8b-instruct** - The Llama 3.1 instruction tuned text only -models (8B, 70B, 405B) are optimized for multilingual dialogue use cases -and outperform many of the available open source and closed chat models -on common industry benchmarks. More details about model can be found in -`Meta blog post `__, `model -website `__ and `model -card `__. ->\ **Note**: run model with demo, you will need to accept license -agreement. >You must be a registered user in Hugging Face Hub. Please -visit `HuggingFace model -card `__, -carefully read terms of usage and click accept button. You will need to -use an access token for the code below to run. For more information on -access tokens, refer to `this section of the -documentation `__. >You -can login on Hugging Face Hub in notebook environment, using following -code: - -.. code:: python - - ## login to huggingfacehub to get access to pretrained model - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -.. code:: ipython3 - - import ipywidgets as widgets - - llm_model_ids = ["OpenVINO/Phi-3-mini-4k-instruct-int4-ov", "meta-llama/Meta-Llama-3.1-8B-Instruct"] - - llm_model_id = widgets.Dropdown( - options=llm_model_ids, - value=llm_model_ids[0], - description="Model:", - disabled=False, - ) - - llm_model_id - - - - -.. parsed-literal:: - - Dropdown(description='Model:', options=('OpenVINO/Phi-3-mini-4k-instruct-int4-ov', 'meta-llama/Meta-Llama-3.1-… - - - -.. code:: ipython3 - - from pathlib import Path - import huggingface_hub as hf_hub - from cmd_helper import optimum_cli - - llm_model_path = llm_model_id.value.split("/")[-1] - repo_name = llm_model_id.value.split("/")[0] - - if not Path(llm_model_path).exists(): - if repo_name == "OpenVINO": - hf_hub.snapshot_download(llm_model_id.value, local_dir=llm_model_path) - else: - optimum_cli(llm_model_id.value, llm_model_path, additional_args={"task": "text-generation-with-past", "weight-format": "int4"}) - -Download Embedding model -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Embedding model is another key component in RAG pipeline. It takes text -as input, and return a long list of numbers used to capture the -semantics of the text. An OpenVINO embedding model and tokenizer can be -exported by ``feature-extraction`` task with ``optimum-cli``. In this -tutorial, we use -`bge-small-en-v1.5 `__ as -example. - -.. 
code:: ipython3 - - embedding_model_id = "BAAI/bge-small-en-v1.5" - embedding_model_path = "bge-small-en-v1.5" - - if not Path(embedding_model_path).exists(): - optimum_cli(embedding_model_id, embedding_model_path, additional_args={"task": "feature-extraction"}) - -Create models -------------- - - - -Create OpenVINO LLM -~~~~~~~~~~~~~~~~~~~ - - - -Select device for LLM model inference - -.. code:: ipython3 - - from notebook_utils import device_widget - - llm_device = device_widget("CPU", exclude=["NPU"]) - - llm_device - -OpenVINO models can be run locally through the ``OpenVINOLLM`` class in -`LlamaIndex `__. -If you have an Intel GPU, you can specify ``device_map="gpu"`` to run -inference on it. - -.. code:: ipython3 - - from llama_index.llms.openvino import OpenVINOLLM - - import openvino.properties as props - import openvino.properties.hint as hints - import openvino.properties.streams as streams - - - ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""} - - - def phi_completion_to_prompt(completion): - return f"<|system|><|end|><|user|>{completion}<|end|><|assistant|>\n" - - - def llama3_completion_to_prompt(completion): - return f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{completion}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - - - llm = OpenVINOLLM( - model_id_or_path=str(llm_model_path), - context_window=3900, - max_new_tokens=1000, - model_kwargs={"ov_config": ov_config}, - generate_kwargs={"do_sample": False, "temperature": None, "top_p": None}, - completion_to_prompt=phi_completion_to_prompt if llm_model_path == "Phi-3-mini-4k-instruct-int4-ov" else llama3_completion_to_prompt, - device_map=llm_device.value, - ) - - -.. parsed-literal:: - - Compiling the model to CPU ... - - -Create OpenVINO Embedding -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Select device for embedding model inference - -.. code:: ipython3 - - embedding_device = device_widget() - - embedding_device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -A Hugging Face embedding model can be supported by OpenVINO through -`OpenVINOEmbeddings `__ -class of LlamaIndex. - -.. code:: ipython3 - - from llama_index.embeddings.huggingface_openvino import OpenVINOEmbedding - - embedding = OpenVINOEmbedding(model_id_or_path=embedding_model_path, device=embedding_device.value) - - -.. parsed-literal:: - - Compiling the model to CPU ... - - -Create tools ------------- - - - -In this examples, we will create 2 customized tools for ``multiply`` and -``add``. - -.. code:: ipython3 - - from llama_index.core.agent import ReActAgent - from llama_index.core.tools import FunctionTool - - - def multiply(a: float, b: float) -> float: - """Multiply two numbers and returns the product""" - return a * b - - - multiply_tool = FunctionTool.from_defaults(fn=multiply) - - - def divide(a: float, b: float) -> float: - """Add two numbers and returns the sum""" - return a / b - - - divide_tool = FunctionTool.from_defaults(fn=divide) - -To demonstrate using RAG engines as a tool in an agent, we’re going to -create a very simple RAG query engine as one of the tools. - - **Note**: For a full RAG pipeline with OpenVINO, you can check the - `RAG notebooks `__ - -.. 
code:: ipython3 - - from llama_index.core import SimpleDirectoryReader - from llama_index.core import VectorStoreIndex, Settings - - Settings.embed_model = embedding - Settings.llm = llm - - reader = SimpleDirectoryReader(input_files=[text_example_en_path]) - documents = reader.load_data() - index = VectorStoreIndex.from_documents( - documents, - ) - -Now we turn our query engine into a tool by supplying the appropriate -metadata (for the python functions, this was being automatically -extracted so we didn’t need to add it): - -.. code:: ipython3 - - from llama_index.core.tools import QueryEngineTool, ToolMetadata - - vector_tool = QueryEngineTool( - index.as_query_engine(streaming=True), - metadata=ToolMetadata( - name="vector_search", - description="Useful for searching for basic facts about 'Intel Xeon 6 processors'", - ), - ) - -Run Agentic RAG ---------------- - - - -We modify our agent by adding this engine to our array of tools (we also -remove the llm parameter, since it’s now provided by settings): - -.. code:: ipython3 - - agent = ReActAgent.from_tools([multiply_tool, divide_tool, vector_tool], llm=llm, verbose=True) - -Ask a question using multiple tools. - -.. code:: ipython3 - - response = agent.chat("What's the maximum number of cores of 8 sockets of 'Intel Xeon 6 processors' ? Go step by step, using a tool to do any math.") - - -.. parsed-literal:: - - Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation. - - -.. parsed-literal:: - - > Running step ee829c21-5642-423d-afcf-27e894aede35. Step input: What's the maximum number of cores of 8 sockets of 'Intel Xeon 6 processors' ? Go step by step, using a tool to do any math. - - -.. parsed-literal:: - - Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation. - - -.. parsed-literal:: - - Thought: The current language of the user is English. I need to use a tool to help me answer the question. - Action: vector_search - Action Input: {'input': 'Intel Xeon 6 processors'} - - -.. parsed-literal:: - - Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation. - - -.. parsed-literal:: - - Observation: According to the provided text, Intel Xeon 6 processors with Efficient-cores are described as having the following features and benefits: - - * Up to 144 cores per socket in 1- or 2-socket configurations, boosting processing capacity, accelerating service mesh performance, and decreasing transaction latency. - * Improved power efficiency and lower idle power ISO configurations, contributing to enhanced sustainability with a TDP range of 205W-330W. - * Intel QuickAssist Technology (Intel QAT) drives fast encryption/key protection, while Intel Software Guard Extensions (Intel SGX) and Intel Trust Domain Extensions (Intel TDX) enable confidential computing for regulated workloads. - * Intel Xeon 6 processor-based platforms with Intel Ethernet 800 Series Network Adapters set the bar for maximum 5G core workload performance and lower operating costs. 
- - These processors are suitable for various industries, including: - - * Telecommunications: 5G core networks, control plane (CP), and user plane functions (UPF) - * Enterprise: Network security appliances, secure access service edge (SASE), next-gen firewall (NGFW), real-time deep packet inspection, antivirus, intrusion prevention and detection, and SSL/TLS inspection - * Media and Entertainment: Content delivery networks, media processing, video on demand (VOD) - * Industrial/Energy: Digitalization of automation, protection, and control - - The processors are also mentioned to be suitable for various use cases, including: - - * 5G core networks - * Network security appliances - * Content delivery networks - * Media processing - * Video on demand (VOD) - * Digitalization of automation, protection, and control in industrial and energy sectors - > Running step c8d3f8b5-0a3e-4254-87a8-c13cd4f992ad. Step input: None - - -.. parsed-literal:: - - Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation. - - -.. parsed-literal:: - - Thought: The current language of the user is English. I need to use a tool to help me answer the question. - Action: multiply - Action Input: {'a': 8, 'b': 144} - Observation: 1152 - > Running step 437a7fcf-7f53-4d7c-b3d4-06b2714a1b9d. Step input: None - Thought: The current language of the user is English. I can answer without using any more tools. I'll use the user's language to answer. - Answer: The maximum number of cores of 8 sockets of 'Intel Xeon 6 processors' is 1152. - - -.. code:: ipython3 - - agent.reset() diff --git a/docs/notebooks/llm-agent-react-langchain-with-output.rst b/docs/notebooks/llm-agent-react-langchain-with-output.rst deleted file mode 100644 index c6c5d70a48a36d..00000000000000 --- a/docs/notebooks/llm-agent-react-langchain-with-output.rst +++ /dev/null @@ -1,838 +0,0 @@ -Create ReAct Agent using OpenVINO and LangChain -=============================================== - -LLM are limited to the knowledge on which they have been trained and the -additional knowledge provided as context, as a result, if a useful piece -of information is missing the provided knowledge, the model cannot “go -around” and try to find it in other sources. This is the reason why we -need to introduce the concept of Agents. - -The core idea of agents is to use a language model to choose a sequence -of actions to take. In agents, a language model is used as a reasoning -engine to determine which actions to take and in which order. Agents can -be seen as applications powered by LLMs and integrated with a set of -tools like search engines, databases, websites, and so on. Within an -agent, the LLM is the reasoning engine that, based on the user input, is -able to plan and execute a set of actions that are needed to fulfill the -request. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/91237924/22fa5396-8381-400f-a78f-97e25d57d807 - :alt: agent - - agent - -`LangChain `__ -is a framework for developing applications powered by language models. -LangChain comes with a number of built-in agents that are optimized for -different use cases. - -This notebook explores how to create an AI Agent step by step using -OpenVINO and LangChain. 
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Create tools <#create-tools>`__ -- `Create prompt template <#create-prompt-template>`__ -- `Create LLM <#create-llm>`__ - - - `Download model <#select-model>`__ - - `Select inference device for - LLM <#select-inference-device-for-llm>`__ - -- `Create agent <#create-agent>`__ -- `Run the agent <#run-agent>`__ -- `Interactive Demo <#interactive-demo>`__ - - - `Use built-in tool <#use-built-in-tool>`__ - - `Create customized tools <#create-customized-tools>`__ - - `Create AI agent demo with Gradio - UI <#create-ai-agent-demo-with-gradio-ui>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py") - open("cmd_helper.py", "w", encoding="utf-8").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("llm-agent-react-langchain.ipynb") - -.. code:: ipython3 - - import os - - os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - - %pip install -Uq pip - %pip uninstall -q -y optimum optimum-intel - %pip install --pre -Uq "openvino>=2024.5.0" openvino-tokenizers[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "transformers>=4.38.1" "langchain>=0.2.3" "langchain-huggingface>=0.1.2" "langchain-community>=0.2.4" "Wikipedia" \ - "torch>=2.1" \ - "datasets" \ - "accelerate" \ - "pydantic<2.10.0" \ - "gradio>=4.19" \ - "huggingface-hub>=0.26.5" - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" \ - "git+https://github.com/openvinotoolkit/nncf.git" - -Create a tools --------------- - - - -First, we need to create some tools to call. In this example, we will -create 3 custom functions to do basic calculation. For `more -information `__ on -creating custom tools. - -.. code:: ipython3 - - from langchain_core.tools import tool - - - @tool - def multiply(first_int: int, second_int: int) -> int: - """Multiply two integers together.""" - return first_int * second_int - - - @tool - def add(first_int: int, second_int: int) -> int: - "Add two integers." - return first_int + second_int - - - @tool - def exponentiate(base: int, exponent: int) -> int: - "Exponentiate the base to the exponent power." - return base**exponent - -.. code:: ipython3 - - print(f"name of `multiply` tool: {multiply.name}") - print(f"description of `multiply` tool: {multiply.description}") - - -.. parsed-literal:: - - name of `multiply` tool: multiply - description of `multiply` tool: Multiply two integers together. - - -Tools are interfaces that an agent, chain, or LLM can use to interact -with the world. They combine a few things: - -1. The name of the tool -2. 
A description of what the tool is -3. JSON schema of what the inputs to the tool are -4. The function to call -5. Whether the result of a tool should be returned directly to the user - -Now that we have created all of them, and we can create a list of tools -that we will use downstream. - -.. code:: ipython3 - - tools = [multiply, add, exponentiate] - -Create prompt template ----------------------- - - - -A prompt for a language model is a set of instructions or input provided -by a user to guide the model’s response, helping it understand the -context and generate relevant and coherent language-based output, such -as answering questions, completing sentences, or engaging in a -conversation. - -Different agents have different prompting styles for reasoning. In this -example, we will use `ReAct agent `__ with -its typical prompt template. For a full list of built-in agents see -`agent -types `__. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/91237924/a83bdf7f-bb9d-4b1f-9a0a-3fe4a76ba1ae - :alt: react - - react - -A ReAct prompt consists of few-shot task-solving trajectories, with -human-written text reasoning traces and actions, as well as environment -observations in response to actions. ReAct prompting is intuitive and -flexible to design, and achieves state-of-the-art few-shot performances -across a variety of tasks, from question answering to online shopping! - -In an prompt template for agent, ``input`` is user’s query and -``agent_scratchpad`` should be a sequence of messages that contains the -previous agent tool invocations and the corresponding tool outputs. - -.. code:: ipython3 - - PREFIX = """Respond to the human as helpfully and accurately as possible. You have access to the following tools:""" - - FORMAT_INSTRUCTIONS = """Use a json blob to specify a tool by providing an action key (tool name) and an action_input key (tool input). - - Valid "action" values: "Final Answer" or {tool_names} - - Provide only ONE action per $JSON_BLOB, as shown: - - ``` - {{{{ - "action": $TOOL_NAME, - "action_input": $INPUT - }}}} - ``` - - Follow this format: - - Question: input question to answer - Thought: consider previous and subsequent steps - Action: - ``` - $JSON_BLOB - ``` - Observation: action result - ... (repeat Thought/Action/Observation N times) - Thought: I know what to respond - Action: - ``` - {{{{ - "action": "Final Answer", - "action_input": "Final response to human" - }}}} - ```""" - - SUFFIX = """Begin! Reminder to ALWAYS respond with a valid json blob of a single action. Use tools if necessary. Respond directly if appropriate. Format is Action:```$JSON_BLOB```then Observation:. - Thought:""" - - HUMAN_MESSAGE_TEMPLATE = "{input}\n\n{agent_scratchpad}" - -Create LLM ----------- - - - -Large Language Models (LLMs) are a core component of LangChain. -LangChain does not serve its own LLMs, but rather provides a standard -interface for interacting with many different LLMs. In this example, we -select following models as LLM in agent pipeline. - -- **qwen2.5-3b-instruct/qwen2.5-7b-instruct/qwen2.5-14b-instruct** - - Qwen2.5 is the latest series of Qwen large language models. Comparing - with Qwen2, Qwen2.5 series brings significant improvements in coding, - mathematics and general knowledge skills. Additionally, it brings - long-context and multiple languages support including Chinese, - English, French, Spanish, Portuguese, German, Italian, Russian, - Japanese, Korean, Vietnamese, Thai, Arabic, and more. 
For more - details, please refer to - `model_card `__, - `blog `__, - `GitHub `__, and - `Documentation `__. -- **llama-3.1-8b-instruct** - The Llama 3.1 instruction tuned text only - models (8B, 70B, 405B) are optimized for multilingual dialogue use - cases and outperform many of the available open source and closed - chat models on common industry benchmarks. More details about model - can be found in `Meta blog - post `__, `model - website `__ and `model - card `__. - >\ **Note**: run model with demo, you will need to accept license - agreement. >You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, - carefully read terms of usage and click accept button. You will need - to use an access token for the code below to run. For more - information on access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: python - - ## login to huggingfacehub to get access to pretrained model - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -Download model -~~~~~~~~~~~~~~ - - - -To run LLM locally, we have to download the model in the first step. It -is possible to `export your -model `__ -to the OpenVINO IR format with the CLI, and load the model from local -folder. - -.. code:: ipython3 - - import ipywidgets as widgets - - llm_model_ids = ["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen2.5-3B-Instruct", "Qwen/qwen2.5-14b-instruct", "meta-llama/Meta-Llama-3.1-8B-Instruct"] - - llm_model_id = widgets.Dropdown( - options=llm_model_ids, - value=llm_model_ids[0], - description="Model:", - disabled=False, - ) - - llm_model_id - - - - -.. parsed-literal:: - - Dropdown(description='Model:', options=('Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-3B-Instruct', 'Qwen/qwen2.5-… - - - -.. code:: ipython3 - - from cmd_helper import optimum_cli - - llm_model_path = llm_model_id.value.split("/")[-1] - repo_name = llm_model_id.value.split("/")[0] - - if not Path(llm_model_path).exists(): - optimum_cli( - llm_model_id.value, - llm_model_path, - additional_args={"task": "text-generation-with-past", "weight-format": "int4", "group-size": "128", "ratio": "1.0", "sym": ""}, - ) - - - -**Export command:** - - - -``optimum-cli export openvino --model Qwen/Qwen2.5-7B-Instruct Qwen2.5-7B-Instruct --task text-generation-with-past --weight-format int4 --group-size 128 --ratio 1.0 --sym`` - - -.. parsed-literal:: - - Downloading shards: 100%|██████████| 4/4 [06:03<00:00, 90.89s/it] - Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 6.20it/s] - We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache) - /home2/ethan/intel/openvino_notebooks/openvino_venv/lib/python3.10/site-packages/optimum/exporters/openvino/model_patcher.py:506: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
- if sequence_length != 1: - /home2/ethan/intel/openvino_notebooks/openvino_venv/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py:165: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if seq_len > self.max_seq_len_cached: - - -.. parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ int8_asym │ 14% (2 / 198) │ 0% (0 / 196) │ - ├───────────────────────────┼─────────────────────────────┼────────────────────────────────────────┤ - │ int4_sym │ 86% (196 / 198) │ 100% (196 / 196) │ - ┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:04:25 • 0:00:00;0;104;181m0:00:01181m0:00:11 - - -Select inference device for LLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget("CPU", exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'GPU', 'AUTO'), value='CPU') - - - -OpenVINO models can be run locally through the ``HuggingFacePipeline`` -class in LangChain. To deploy a model with OpenVINO, you can specify the -``backend="openvino"`` parameter to trigger OpenVINO as backend -inference framework. For `more -information `__. - -.. code:: ipython3 - - from langchain_huggingface import HuggingFacePipeline - from transformers.generation.stopping_criteria import StoppingCriteriaList, StoppingCriteria - - import openvino.properties as props - import openvino.properties.hint as hints - import openvino.properties.streams as streams - - import torch - - if hasattr(torch, "mps") and torch.mps.is_available: - torch.mps.is_available = lambda: False - - - class StopSequenceCriteria(StoppingCriteria): - """ - This class can be used to stop generation whenever a sequence of tokens is encountered. - - Args: - stop_sequences (`str` or `List[str]`): - The sequence (or list of sequences) on which to stop execution. - tokenizer: - The tokenizer used to decode the model outputs. 
- """ - - def __init__(self, stop_sequences, tokenizer): - if isinstance(stop_sequences, str): - stop_sequences = [stop_sequences] - self.stop_sequences = stop_sequences - self.tokenizer = tokenizer - - def __call__(self, input_ids, scores, **kwargs) -> bool: - decoded_output = self.tokenizer.decode(input_ids.tolist()[0]) - return any(decoded_output.endswith(stop_sequence) for stop_sequence in self.stop_sequences) - - - ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""} - stop_tokens = ["Observation:", "Observation:\n"] - - ov_llm = HuggingFacePipeline.from_model_id( - model_id=llm_model_path, - task="text-generation", - backend="openvino", - model_kwargs={ - "device": device.value, - "ov_config": ov_config, - "trust_remote_code": True, - }, - pipeline_kwargs={"max_new_tokens": 2048}, - ) - - tokenizer = ov_llm.pipeline.tokenizer - ov_llm.pipeline._forward_params["stopping_criteria"] = StoppingCriteriaList([StopSequenceCriteria(stop_tokens, tokenizer)]) - -.. code:: ipython3 - - from langchain_huggingface import ChatHuggingFace - - ov_chat = ChatHuggingFace(llm=ov_llm, verbose=True) - ov_chat = ov_chat.bind(skip_prompt=True, stop=["Observation:", "Observation:\n"]) - - if llm_model_id.value == "meta-llama/Meta-Llama-3.1-8B-Instruct": - ov_chat.llm.pipeline.tokenizer.pad_token_id = ov_chat.llm.pipeline.tokenizer.eos_token_id - -You can get additional inference speed improvement with `Dynamic -Quantization of activations and KV-cache quantization on -CPU `__. -These options can be enabled with ``ov_config`` as follows: - -.. code:: ipython3 - - ov_config = { - "KV_CACHE_PRECISION": "u8", - "DYNAMIC_QUANTIZATION_GROUP_SIZE": "32", - hints.performance_mode(): hints.PerformanceMode.LATENCY, - streams.num(): "1", - props.cache_dir(): "", - } - -Create agent ------------- - - - -Now that we have defined the tools, prompt template and LLM, we can -create the agent_executor. - -The agent executor is the runtime for an agent. This is what actually -calls the agent, executes the actions it chooses, passes the action -outputs back to the agent, and repeats. - -.. code:: ipython3 - - from langchain.agents import AgentExecutor, StructuredChatAgent - - agent = StructuredChatAgent.from_llm_and_tools( - ov_chat, - tools, - prefix=PREFIX, - suffix=SUFFIX, - human_message_template=HUMAN_MESSAGE_TEMPLATE, - format_instructions=FORMAT_INSTRUCTIONS, - ) - agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True) - -Run the agent -------------- - - - -We can now run the agent with a math query. Before getting the final -answer, a agent executor will also produce intermediate steps of -reasoning and actions. The format of these messages will follow your -prompt template. - -.. code:: ipython3 - - agent_executor.invoke({"input": "Take 3 to the fifth power and multiply that by the sum of twelve and three, then square the whole result"}) - - -.. parsed-literal:: - - - - > Entering new AgentExecutor chain... - Thought: First, I need to calculate 3 raised to the fifth power. Then, I will find the sum of twelve and three. After that, I will multiply the first result by the second result, and finally, I will square the entire result. - - Action: - ``` - { - "action": "exponentiate", - "action_input": { - "base": 3, - "exponent": 5 - } - } - ``` - Observation: - Observation: 243 - Thought:Next, I need to find the sum of twelve and three. 
- - Action: - ``` - { - "action": "add", - "action_input": { - "first_int": 12, - "second_int": 3 - } - } - ``` - Observation: - Observation: 15 - Thought:Now, I will multiply the result of \(3^5\) (which is 243) by the sum of twelve and three (which is 15). After that, I will square the entire result. - - Action: - ``` - { - "action": "multiply", - "action_input": { - "first_int": 243, - "second_int": 15 - } - } - ``` - Observation: - Observation: 3645 - Thought:Thought: Now I need to square the result of the multiplication, which is 3645. - - Action: - ``` - { - "action": "exponentiate", - "action_input": { - "base": 3645, - "exponent": 2 - } - } - ``` - Observation: - Observation: 13286025 - Thought:Thought: I know what to respond - - Action: - ``` - { - "action": "Final Answer", - "action_input": "The final result is 13286025." - } - ``` - - > Finished chain. - - - - -.. parsed-literal:: - - {'input': 'Take 3 to the fifth power and multiply that by the sum of twelve and three, then square the whole result', - 'output': 'The final result is 13286025.'} - - - -Interactive Demo ----------------- - - - -Let’s create a interactive agent using -`Gradio `__. - -Use built-in tools -~~~~~~~~~~~~~~~~~~ - - - -LangChain has provided a list of all `built-in -tools `__. In -this example, we will use ``Wikipedia`` python package to query key -words generated by agent. - -.. code:: ipython3 - - from langchain_community.tools import WikipediaQueryRun - from langchain_community.utilities import WikipediaAPIWrapper - from langchain_core.callbacks import CallbackManagerForToolRun - from typing import Optional - - from pydantic import BaseModel, Field - - - class WikipediaQueryRunWrapper(WikipediaQueryRun): - def _run( - self, - text: str, - run_manager: Optional[CallbackManagerForToolRun] = None, - ) -> str: - """Use the Wikipedia tool.""" - return self.api_wrapper.run(text) - - - api_wrapper = WikipediaAPIWrapper(top_k_results=2, doc_content_chars_max=1000) - - - class WikiInputs(BaseModel): - """inputs to the wikipedia tool.""" - - text: str = Field(description="query to look up on wikipedia.") - - - wikipedia = WikipediaQueryRunWrapper( - description="A wrapper around Wikipedia. Useful for when you need to answer general questions about people, places, companies, facts, historical events, or other subjects. Input should be a search query.", - args_schema=WikiInputs, - api_wrapper=api_wrapper, - ) - -.. code:: ipython3 - - wikipedia.invoke({"text": "OpenVINO"}) - - - - -.. parsed-literal:: - - 'Page: OpenVINO\nSummary: OpenVINO is an open-source software toolkit for optimizing and deploying deep learning models. It enables programmers to develop scalable and efficient AI solutions with relatively few lines of code. It supports several popular model formats and categories, such as large language models, computer vision, and generative AI.\nActively developed by Intel, it prioritizes high-performance inference on Intel hardware but also supports ARM/ARM64 processors and encourages contributors to add new devices to the portfolio.\nBased in C++, it offers the following APIs: C/C++, Python, and Node.js (an early preview).\nOpenVINO is cross-platform and free for use under Apache License 2.0.\n\nPage: Audacity (audio editor)\nSummary: Audacity is a free and open-source digital audio editor and recording application software, available for Windows, macOS, Linux, and other Unix-like operating systems. \nAs of December 6, 2022, Audacity is the most popular download at FossHub, with over 114.' 
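-
-
-If needed, the wrapper defined above can be sanity-checked with other
-queries before handing it to the agent. The example below is hypothetical
-(it assumes network access to Wikipedia, and the returned summary will
-vary over time); it is not part of the original notebook flow.
-
-.. code:: python
-
-    # Hypothetical extra check of the `wikipedia` tool defined above.
-    # The call follows the WikiInputs schema ({"text": <search query>});
-    # only the first characters of the returned summary are printed.
-    extra_result = wikipedia.invoke({"text": "Intel Corporation"})
-    print(extra_result[:200])
-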
- - - -Create customized tools -~~~~~~~~~~~~~~~~~~~~~~~ - - - -In this examples, we will create 2 customized tools for -``image generation`` and ``weather qurey``. - -.. code:: ipython3 - - import urllib.parse - import json5 - - - @tool - def painting(prompt: str) -> str: - """ - AI painting (image generation) service, input text description, and return the image URL drawn based on text information. - """ - prompt = urllib.parse.quote(prompt) - return json5.dumps({"image_url": f"https://image.pollinations.ai/prompt/{prompt}"}, ensure_ascii=False) - - - painting.invoke({"prompt": "a cat"}) - - - - -.. parsed-literal:: - - '{image_url: "https://image.pollinations.ai/prompt/a%20cat"}' - - - -.. code:: ipython3 - - @tool - def weather( - city_name: str, - ) -> str: - """ - Get the current weather for `city_name` - """ - - if not isinstance(city_name, str): - raise TypeError("City name must be a string") - - key_selection = { - "current_condition": [ - "temp_C", - "FeelsLikeC", - "humidity", - "weatherDesc", - "observation_time", - ], - } - import requests - - resp = requests.get(f"https://wttr.in/{city_name}?format=j1") - resp.raise_for_status() - resp = resp.json() - ret = {k: {_v: resp[k][0][_v] for _v in v} for k, v in key_selection.items()} - - return str(ret) - - - weather.invoke({"city_name": "London"}) - - - - -.. parsed-literal:: - - "{'current_condition': {'temp_C': '0', 'FeelsLikeC': '-4', 'humidity': '86', 'weatherDesc': [{'value': 'Clear'}], 'observation_time': '12:16 AM'}}" - - - -Create AI agent demo with Gradio UI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - tools = [wikipedia, painting, weather] - - agent = StructuredChatAgent.from_llm_and_tools( - ov_chat, - tools, - prefix=PREFIX, - suffix=SUFFIX, - human_message_template=HUMAN_MESSAGE_TEMPLATE, - format_instructions=FORMAT_INSTRUCTIONS, - ) - agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True) - -.. code:: ipython3 - - def partial_text_processor(partial_text, new_text): - """ - helper for updating partially generated answer, used by default - - Params: - partial_text: text buffer for storing previosly generated text - new_text: text update for the current step - Returns: - updated text string - - """ - partial_text += new_text - return partial_text - - - def run_chatbot(history): - """ - callback function for running chatbot on submit button click - - Params: - history: conversation history - - """ - partial_text = "" - - for new_text in agent_executor.stream( - {"input": history[-1][0]}, - ): - if "output" in new_text.keys(): - partial_text = partial_text_processor(partial_text, new_text["output"]) - history[-1][1] = partial_text - yield history - - - def request_cancel(): - ov_chat.llm.pipeline.model.request.cancel() - -.. 
code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-agent-react/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - examples = [ - ["Based on current weather in London, show me a picture of Big Ben through its URL"], - ["What is OpenVINO ?"], - ["Create an image of pink cat and return its URL"], - ["How many people live in Canada ?"], - ["What is the weather like in New York now ?"], - ] - - demo = make_demo(run_fn=run_chatbot, stop_fn=request_cancel, examples=examples) - - try: - demo.launch() - except Exception: - demo.launch(share=True) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ - -.. code:: ipython3 - - # please uncomment and run this cell for stopping gradio interface - # demo.close() diff --git a/docs/notebooks/llm-agent-react-with-output.rst b/docs/notebooks/llm-agent-react-with-output.rst deleted file mode 100644 index ae85610c1f500c..00000000000000 --- a/docs/notebooks/llm-agent-react-with-output.rst +++ /dev/null @@ -1,687 +0,0 @@ -Create a native Agent with OpenVINO -=================================== - -LLM are limited to the knowledge on which they have been trained and the -additional knowledge provided as context, as a result, if a useful piece -of information is missing the provided knowledge, the model cannot “go -around” and try to find it in other sources. This is the reason why we -need to introduce the concept of Agents. - -The core idea of agents is to use a language model to choose a sequence -of actions to take. In agents, a language model is used as a reasoning -engine to determine which actions to take and in which order. Agents can -be seen as applications powered by LLMs and integrated with a set of -tools like search engines, databases, websites, and so on. Within an -agent, the LLM is the reasoning engine that, based on the user input, is -able to plan and execute a set of actions that are needed to fulfill the -request. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/91237924/22fa5396-8381-400f-a78f-97e25d57d807 - :alt: agent - - agent - -This example will demonstrate how to create a native agent with -OpenVINO. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Create LLM as agent <#create-llm-as-agent>`__ - - - `Download model <#select-model>`__ - - `Select inference device for - LLM <#select-inference-device-for-llm>`__ - - `Instantiate LLM using Optimum - Intel <#instantiate-llm-using-optimum-intel>`__ - - `Create text generation method <#create-text-generation-method>`__ - -- `Create prompt template <#create-prompt-template>`__ -- `Create parser <#create-parers>`__ -- `Create tools calling <#create-tool-calling>`__ -- `Run agent <#run-agent>`__ -- `Create AI agent demo with Gradio - UI <#create-ai-agent-demo-with-gradio-ui>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. 
code:: ipython3 - - import os - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("pip_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py", - ) - open("pip_helper.py", "w").write(r.text) - - os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - - from pip_helper import pip_install - - pip_install( - "-q", - "--extra-index-url", - "https://download.pytorch.org/whl/cpu", - "transformers>=4.43.1", - "gradio>=4.19", - ) - pip_install( - "-q", - "git+https://github.com/huggingface/optimum-intel.git", - "git+https://github.com/openvinotoolkit/nncf.git", - "datasets", - "accelerate", - "huggingface-hub>=0.26.5", - ) - pip_install("--pre", "-Uq", "openvino>=2024.4.0", "--extra-index-url", "https://storage.openvinotoolkit.org/simple/wheels/nightly") - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("llm-agent-react.ipynb") - -Create LLM as agent -------------------- - - - -Download LLM -~~~~~~~~~~~~ - - - -To run LLM locally, we have to download the model in the first step. It -is possible to `export your -model `__ -to the OpenVINO IR format with the CLI, and load the model from local -folder. - -Large Language Models (LLMs) are a core component of agent. LlamaIndex -does not serve its own LLMs, but rather provides a standard interface -for interacting with many different LLMs. In this example, we can select -``Qwen2.5`` as LLM in agent pipeline. - -* **qwen2.5-3b-instruct/qwen2.5-7b-instruct/qwen2.5-14b-instruct** - - Qwen2.5 is the latest series of Qwen large language models. Comparing - with Qwen2, Qwen2.5 series brings significant improvements in coding, - mathematics and general knowledge skills. Additionally, it brings - long-context and multiple languages support including Chinese, English, - French, Spanish, Portuguese, German, Italian, Russian, Japanese, Korean, - Vietnamese, Thai, Arabic, and more. For more details, please refer to - `model_card `__, - `blog `__, - `GitHub `__, and - `Documentation `__. - -.. code:: ipython3 - - import ipywidgets as widgets - - llm_model_ids = ["Qwen/Qwen2.5-3B-Instruct", "Qwen/Qwen2.5-7B-Instruct", "Qwen/qwen2.5-14b-instruct"] - - llm_model_id = widgets.Dropdown( - options=llm_model_ids, - value=llm_model_ids[0], - description="Model:", - disabled=False, - ) - - llm_model_id - - - - -.. parsed-literal:: - - Dropdown(description='Model:', options=('Qwen/Qwen2.5-3B-Instruct', 'Qwen/Qwen2.5-7B-Instruct', 'Qwen/qwen2.5-… - - - -.. code:: ipython3 - - from pathlib import Path - - llm_model_path = llm_model_id.value.split("/")[-1] - - if not Path(llm_model_path).exists(): - !optimum-cli export openvino --model {llm_model_id.value} --task text-generation-with-past --trust-remote-code --weight-format int4 --group-size 128 --ratio 1.0 --sym {llm_model_path} - -Select inference device for LLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - llm_device = device_widget("CPU", exclude=["NPU"]) - - llm_device - - - - -.. 
parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'GPU', 'AUTO'), value='CPU') - - - -Instantiate LLM using Optimum Intel ------------------------------------ - - - -Optimum Intel can be used to load optimized models from the `Hugging -Face Hub `__ and -create pipelines to run an inference with OpenVINO Runtime using Hugging -Face APIs. The Optimum Inference models are API compatible with Hugging -Face Transformers models. This means we just need to replace -``AutoModelForXxx`` class with the corresponding ``OVModelForXxx`` -class. - -Below is an example of the RedPajama model - -.. code:: diff - - -from transformers import AutoModelForCausalLM - +from optimum.intel.openvino import OVModelForCausalLM - from transformers import AutoTokenizer, pipeline - - model_id = "togethercomputer/RedPajama-INCITE-Chat-3B-v1" - -model = AutoModelForCausalLM.from_pretrained(model_id) - +model = OVModelForCausalLM.from_pretrained(model_id, export=True) - -Model class initialization starts with calling ``from_pretrained`` -method. When downloading and converting Transformers model, the -parameter ``export=True`` should be added (as we already converted model -before, we do not need to provide this parameter). We can save the -converted model for the next usage with the ``save_pretrained`` method. -Tokenizer class and pipelines API are compatible with Optimum models. - -You can find more details about OpenVINO LLM inference using HuggingFace -Optimum API in `LLM inference -guide `__. - -.. code:: ipython3 - - from optimum.intel.openvino import OVModelForCausalLM - from transformers import AutoTokenizer, AutoConfig, TextStreamer - from transformers.generation import ( - StoppingCriteriaList, - StoppingCriteria, - ) - import openvino.properties as props - import openvino.properties.hint as hints - import openvino.properties.streams as streams - - import json - import json5 - import torch - - tokenizer = AutoTokenizer.from_pretrained(llm_model_path, trust_remote_code=True) - - ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""} - - llm = OVModelForCausalLM.from_pretrained( - llm_model_path, - device=llm_device.value, - ov_config=ov_config, - config=AutoConfig.from_pretrained(llm_model_path, trust_remote_code=True), - trust_remote_code=True, - ) - - llm.generation_config.top_k = 1 - llm.generation_config.max_length = 2000 - -Create text generation method -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -In this example, we would like to stream the output text though -``TextStreamer``, and stop text generation before ``Observation`` -received from tool calling.. - -.. code:: ipython3 - - class StopSequenceCriteria(StoppingCriteria): - """ - This class can be used to stop generation whenever a sequence of tokens is encountered. - - Args: - stop_sequences (`str` or `List[str]`): - The sequence (or list of sequences) on which to stop execution. - tokenizer: - The tokenizer used to decode the model outputs. 
- """ - - def __init__(self, stop_sequences, tokenizer): - if isinstance(stop_sequences, str): - stop_sequences = [stop_sequences] - self.stop_sequences = stop_sequences - self.tokenizer = tokenizer - - def __call__(self, input_ids, scores, **kwargs) -> bool: - decoded_output = self.tokenizer.decode(input_ids.tolist()[0]) - return any(decoded_output.endswith(stop_sequence) for stop_sequence in self.stop_sequences) - - - def text_completion(prompt: str, stop_words) -> str: - im_end = "<|im_end|>" - if im_end not in stop_words: - stop_words = stop_words + [im_end] - streamer = TextStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True) - - stopping_criteria = StoppingCriteriaList([StopSequenceCriteria(stop_words, tokenizer)]) - input_ids = torch.tensor([tokenizer.encode(prompt)]) - generate_kwargs = dict( - input_ids=input_ids, - streamer=streamer, - stopping_criteria=stopping_criteria, - ) - output = llm.generate(**generate_kwargs) - output = output.tolist()[0] - output = tokenizer.decode(output, errors="ignore") - assert output.startswith(prompt) - output = output[len(prompt) :].replace("<|endoftext|>", "").replace(im_end, "") - - for stop_str in stop_words: - idx = output.find(stop_str) - if idx != -1: - output = output[: idx + len(stop_str)] - return output - -Create prompt template ----------------------- - - - -A prompt for a language model is a set of instructions or input provided -by a user to guide the model’s response, helping it understand the -context and generate relevant and coherent language-based output, such -as answering questions, completing sentences, or engaging in a -conversation. - -Different agents have different prompting styles for reasoning. In this -example, we will use `ReAct agent `__ with -its typical prompt template. For a full list of built-in agents see -`agent -types `__. - -.. figure:: https://github.com/user-attachments/assets/c26432c2-3cf1-4942-ae03-fd8e8ebb4509 - :alt: react - - react - -A ReAct prompt consists of few-shot task-solving trajectories, with -human-written text reasoning traces and actions, as well as environment -observations in response to actions. ReAct prompting is intuitive and -flexible to design, and achieves state-of-the-art few-shot performances -across a variety of tasks, from question answering to online shopping! - -In an prompt template for agent, ``query`` is user’s query and other -parameter should be a sequence of messages that contains the -``descriptions`` and ``parameters`` of agent tool. - -.. code:: ipython3 - - TOOL_DESC = """{name_for_model}: Call this tool to interact with the {name_for_human} API. What is the {name_for_human} API useful for? {description_for_model} Parameters: {parameters}""" - - PROMPT_REACT = """Answer the following questions as best you can. You have access to the following APIs: - - {tools_text} - - Use the following format: - - Question: the input question you must answer - Thought: you should always think about what to do - Action: the action to take, should be one of [{tools_name_text}] - Action Input: the input to the action - Observation: the result of the action - ... (this Thought/Action/Action Input/Observation can be repeated zero or more times) - Thought: I now know the final answer - Final Answer: the final answer to the original input question - - Begin! - - Question: {query}""" - -Meanwhile we have to create function for consolidate the tools -information and conversation history into the prompt template. - -.. 
code:: ipython3 - - def build_input_text(chat_history, list_of_tool_info) -> str: - tools_text = [] - for tool_info in list_of_tool_info: - tool = TOOL_DESC.format( - name_for_model=tool_info["name_for_model"], - name_for_human=tool_info["name_for_human"], - description_for_model=tool_info["description_for_model"], - parameters=json.dumps(tool_info["parameters"], ensure_ascii=False), - ) - if tool_info.get("args_format", "json") == "json": - tool += " Format the arguments as a JSON object." - elif tool_info["args_format"] == "code": - tool += " Enclose the code within triple backticks (`) at the beginning and end of the code." - else: - raise NotImplementedError - tools_text.append(tool) - tools_text = "\n\n".join(tools_text) - - tools_name_text = ", ".join([tool_info["name_for_model"] for tool_info in list_of_tool_info]) - - messages = [{"role": "system", "content": "You are a helpful assistant."}] - for i, (query, response) in enumerate(chat_history): - if list_of_tool_info: - if (len(chat_history) == 1) or (i == len(chat_history) - 2): - query = PROMPT_REACT.format( - tools_text=tools_text, - tools_name_text=tools_name_text, - query=query, - ) - if query: - messages.append({"role": "user", "content": query}) - if response: - messages.append({"role": "assistant", "content": response}) - - prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, return_tensors="pt") - - return prompt - -Create parser -------------- - - - -A Parser is used to convert raw output of LLM to the input arguments of -tools. - -.. code:: ipython3 - - def parse_latest_tool_call(text): - tool_name, tool_args = "", "" - i = text.rfind("\nAction:") - j = text.rfind("\nAction Input:") - k = text.rfind("\nObservation:") - if 0 <= i < j: # If the text has `Action` and `Action input`, - if k < j: # but does not contain `Observation`, - # then it is likely that `Observation` is ommited by the LLM, - # because the output text may have discarded the stop word. - text = text.rstrip() + "\nObservation:" # Add it back. - k = text.rfind("\nObservation:") - tool_name = text[i + len("\nAction:") : j].strip() - tool_args = text[j + len("\nAction Input:") : k].strip() - text = text[:k] - return tool_name, tool_args, text - -Create tools calling --------------------- - - - -In this examples, we will create 2 customized tools for -``image generation`` and ``weather qurey``. A detailed description of -these tools should be defined in json format, which will be used as part -of prompt. - -.. code:: ipython3 - - tools = [ - { - "name_for_human": "get weather", - "name_for_model": "get_weather", - "description_for_model": 'Get the current weather in a given city name."', - "parameters": [ - { - "name": "city_name", - "description": "City name", - "required": True, - "schema": {"type": "string"}, - } - ], - }, - { - "name_for_human": "image generation", - "name_for_model": "image_gen", - "description_for_model": "AI painting (image generation) service, input text description, and return the image URL drawn based on text information.", - "parameters": [ - { - "name": "prompt", - "description": "describe the image", - "required": True, - "schema": {"type": "string"}, - } - ], - }, - ] - -Then we should implement these tools with inputs and outputs, and -execute them according to the output of LLM. - -.. 
code:: ipython3 - - def call_tool(tool_name: str, tool_args: str) -> str: - if tool_name == "get_weather": - city_name = json5.loads(tool_args)["city_name"] - key_selection = { - "current_condition": [ - "temp_C", - "FeelsLikeC", - "humidity", - "weatherDesc", - "observation_time", - ], - } - resp = requests.get(f"https://wttr.in/{city_name}?format=j1") - resp.raise_for_status() - resp = resp.json() - ret = {k: {_v: resp[k][0][_v] for _v in v} for k, v in key_selection.items()} - return str(ret) - elif tool_name == "image_gen": - import urllib.parse - - tool_args = tool_args.replace("(", "").replace(")", "") - prompt = json5.loads(tool_args)["prompt"] - prompt = urllib.parse.quote(prompt) - return json.dumps( - {"image_url": f"https://image.pollinations.ai/prompt/{prompt}"}, - ensure_ascii=False, - ) - else: - raise NotImplementedError - - - def llm_with_tool(prompt: str, history, list_of_tool_info=()): - chat_history = [(x["user"], x["bot"]) for x in history] + [(prompt, "")] - - planning_prompt = build_input_text(chat_history, list_of_tool_info) - text = "" - while True: - output = text_completion(planning_prompt + text, stop_words=["Observation:", "Observation:\n"]) - action, action_input, output = parse_latest_tool_call(output) - if action: - observation = call_tool(action, action_input) - output += f"\nObservation: = {observation}\nThought:" - observation = f"{observation}\nThought:" - print(observation) - text += output - else: - text += output - break - - new_history = [] - new_history.extend(history) - new_history.append({"user": prompt, "bot": text}) - return text, new_history - -Run agent ---------- - - - -.. code:: ipython3 - - history = [] - query = "get the weather in London, and create a picture of Big Ben based on the weather information" - - response, history = llm_with_tool(prompt=query, history=history, list_of_tool_info=tools) - - -.. parsed-literal:: - - The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results. - - -.. parsed-literal:: - - Thought: First, I need to use the get_weather API to get the current weather in London. - Action: get_weather - Action Input: {"city_name": "London"} - Observation: - {'current_condition': {'temp_C': '6', 'FeelsLikeC': '3', 'humidity': '75', 'weatherDesc': [{'value': 'Overcast'}], 'observation_time': '12:32 AM'}} - Thought: - Now that I have the weather information, I will use the image_gen API to generate an image of Big Ben based on the weather conditions. - Action: image_gen - Action Input: {"prompt": "Big Ben under an overcast sky with a temperature of 6 degrees Celsius and humidity of 75%"} - Observation: - {"image_url": "https://image.pollinations.ai/prompt/Big%20Ben%20under%20an%20overcast%20sky%20with%20a%20temperature%20of%206%20degrees%20Celsius%20and%20humidity%20of%2075%25"} - Thought: - I now know the final answer. - Final Answer: The current weather in London is overcast with a temperature of 6 degrees Celsius and humidity of 75%. Based on this information, here is the generated image of Big Ben under an overcast sky: ![](https://image.pollinations.ai/prompt/Big%20Ben%20under%20an%20overcast%20sky%20with%20a%20temperature%20of%206%20degrees%20Celsius%20and%20humidity%20of%2075%25). - - -Create AI agent demo with Gradio UI ------------------------------------ - - - -.. 
code:: ipython3 - - from transformers import TextIteratorStreamer - from threading import Thread - - - def run_chatbot(history): - """ - callback function for running chatbot on submit button click - - Params: - history: conversation history - - """ - chat_history = [(history[-1][0], "")] - - prompt = build_input_text(chat_history, tools) - text = "" - while True: - planning_prompt = prompt + text - im_end = "<|im_end|>" - stop_words = ["Observation:", "Observation:\n"] - if im_end not in stop_words: - stop_words = stop_words + [im_end] - streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True) - - stopping_criteria = StoppingCriteriaList([StopSequenceCriteria(stop_words, tokenizer)]) - input_ids = torch.tensor([tokenizer.encode(planning_prompt)]) - generate_kwargs = dict( - input_ids=input_ids, - streamer=streamer, - stopping_criteria=stopping_criteria, - ) - - thread = Thread(target=llm.generate, kwargs=generate_kwargs) - thread.start() - output = "" - output_gui = "" - show_response = False - for new_text in streamer: - output += new_text - if "Final" in new_text: - show_response = True - if show_response: - output_gui += new_text - history[-1][1] = output_gui - yield history - - # assert buffer.startswith(prompt) - for stop_str in stop_words: - idx = output.find(stop_str) - if idx != -1: - output = output[: idx + len(stop_str)] - print(output) - action, action_input, output = parse_latest_tool_call(output) - if action: - observation = call_tool(action, action_input) - output += f"\nObservation: = {observation}\nThought:" - observation = f"{observation}\nThought:" - print(observation) - text += output - else: - text += output - break - - - def request_cancel(): - llm.request.cancel() - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-agent-react/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - examples = [ - ["Based on current weather in Beijing, show me a picture of Great Wall through its URL"], - ["Create an image of pink cat and return its URL"], - ["What is the weather like in New York now ?"], - ] - - demo = make_demo(run_fn=run_chatbot, stop_fn=request_cancel, examples=examples) - - try: - demo.launch() - except Exception: - demo.launch(share=True) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ - -.. code:: ipython3 - - # please uncomment and run this cell for stopping gradio interface - # demo.close() - - -.. parsed-literal:: - - Closing server running on port: 5612 - diff --git a/docs/notebooks/llm-chatbot-generate-api-with-output.rst b/docs/notebooks/llm-chatbot-generate-api-with-output.rst deleted file mode 100644 index ae7eb175c006e4..00000000000000 --- a/docs/notebooks/llm-chatbot-generate-api-with-output.rst +++ /dev/null @@ -1,1112 +0,0 @@ -Create an LLM-powered Chatbot using OpenVINO Generate API -========================================================= - -In the rapidly evolving world of artificial intelligence (AI), chatbots -have emerged as powerful tools for businesses to enhance customer -interactions and streamline operations. Large Language Models (LLMs) are -artificial intelligence systems that can understand and generate human -language. 
They use deep learning algorithms and massive amounts of data -to learn the nuances of language and produce coherent and relevant -responses. While a decent intent-based chatbot can answer basic, -one-touch inquiries like order management, FAQs, and policy questions, -LLM chatbots can tackle more complex, multi-touch questions. LLM enables -chatbots to provide support in a conversational manner, similar to how -humans do, through contextual memory. Leveraging the capabilities of -Language Models, chatbots are becoming increasingly intelligent, capable -of understanding and responding to human language with remarkable -accuracy. - -Previously, we already discussed how to build an instruction-following -pipeline using OpenVINO, please check out `this -tutorial `__ for -reference. In this tutorial, we consider how to use the power of -OpenVINO for running Large Language Models for chat. We will use a -pre-trained model from the `Hugging Face -Transformers `__ -library. The `Hugging Face Optimum -Intel `__ library -converts the models to OpenVINO™ IR format. To simplify the user -experience, we will use `OpenVINO Generate -API `__ for -generation pipeline. - -The tutorial consists of the following steps: - -- Install prerequisites -- Download and convert the model from a public source using the - `OpenVINO integration with Hugging Face - Optimum `__. -- Compress model weights to 4-bit or 8-bit data types using - `NNCF `__ -- Create a chat inference pipeline with `OpenVINO Generate - API `__. -- Run chat pipeline with `Gradio `__. - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Select model for inference <#select-model-for-inference>`__ -- `Convert model using Optimum-CLI - tool <#convert-model-using-optimum-cli-tool>`__ - - - `Weights Compression using - Optimum-CLI <#weights-compression-using-optimum-cli>`__ - -- `Select device for inference <#select-device-for-inference>`__ -- `Instantiate pipeline with OpenVINO Generate - API <#instantiate-pipeline-with-openvino-generate-api>`__ -- `Run Chatbot <#run-chatbot>`__ - - - `Advanced generation options <#advanced-generation-options>`__ - -Prerequisites -------------- - - - -Install required dependencies - -.. code:: ipython3 - - import os - import platform - - os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - - %pip install -q -U "openvino>=2024.6.0" openvino-tokenizers[transformers] openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu\ - "git+https://github.com/huggingface/optimum-intel.git"\ - "nncf==2.14.1"\ - "torch>=2.1"\ - "datasets" \ - "accelerate" \ - "gradio>=4.19" \ - "transformers>=4.43.1" \ - "huggingface-hub>=0.26.5" \ - "einops" "transformers_stream_generator" "tiktoken" "bitsandbytes" - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - -.. 
code:: ipython3 - - import os - from pathlib import Path - import requests - import shutil - - # fetch model configuration - - config_shared_path = Path("../../utils/llm_config.py") - config_dst_path = Path("llm_config.py") - - if not config_dst_path.exists(): - if config_shared_path.exists(): - try: - os.symlink(config_shared_path, config_dst_path) - except Exception: - shutil.copy(config_shared_path, config_dst_path) - else: - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py") - with open("llm_config.py", "w", encoding="utf-8") as f: - f.write(r.text) - elif not os.path.islink(config_dst_path): - print("LLM config will be updated") - if config_shared_path.exists(): - shutil.copy(config_shared_path, config_dst_path) - else: - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py") - with open("llm_config.py", "w", encoding="utf-8") as f: - f.write(r.text) - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("llm-chatbot-generate-api.ipynb") - -Select device for inference ---------------------------- - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(default="CPU") - - device - - -.. parsed-literal:: - - :247: DeprecationWarning: The `openvino.runtime` module is deprecated and will be removed in the 2026.0 release. Please replace `openvino.runtime` with `openvino`. - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -Select model for inference --------------------------- - - - -The tutorial supports different models, you can select one from the -provided options to compare the quality of open source LLM solutions. -Model conversion and optimization is time- and memory-consuming process. -For your convenience, we provide a -`collection `__ -of optimized models on HuggingFace hub. You can skip the model -conversion step by selecting one of the available on HuggingFace hub -model. If you want to reproduce optimization process locally, please -unset **Use preconverted models** checkbox. - - **Note**: conversion of some models can require additional actions - from user side and at least 64GB RAM for conversion. - -`Weight -compression `__ -is a technique for enhancing the efficiency of models, especially those -with large memory requirements. This method reduces the model’s memory -footprint, a crucial factor for Large Language Models (LLMs). We provide -several options for model weight compression: - -- **FP16** reducing model binary size on disk using ``save_model`` with - enabled compression weights to FP16 precision. This approach is - available in OpenVINO from scratch and is the default behavior. -- **INT8** is an 8-bit weight-only quantization provided by - `NNCF `__: This method - compresses weights to an 8-bit integer data type, which balances - model size reduction and accuracy, making it a versatile option for a - broad range of applications. -- **INT4** is an 4-bit weight-only quantization provided by - `NNCF `__. 
-  It involves quantizing weights to an unsigned 4-bit integer, either
-  symmetrically around a fixed zero point of eight (i.e., the midpoint
-  between zero and 15) in the case of **symmetric quantization**, or
-  asymmetrically with a non-fixed zero point in the case of **asymmetric
-  quantization**. Compared to INT8 compression, INT4 compression improves
-  performance even more, but introduces a minor drop in prediction
-  quality. INT4 is ideal for situations where speed is prioritized over
-  an acceptable trade-off against accuracy.
-- **INT4 AWQ** is a 4-bit activation-aware weight quantization.
-  `Activation-aware Weight
-  Quantization `__ (AWQ) is an
-  algorithm that tunes model weights for more accurate INT4
-  compression. It slightly improves generation quality of compressed
-  LLMs, but requires significant additional time for tuning weights on
-  a calibration dataset. We will use the ``wikitext-2-raw-v1/train``
-  subset of the
-  `Wikitext `__
-  dataset for calibration.
-- **INT4 NPU-friendly** is a 4-bit channel-wise quantization. This
-  approach is
-  `recommended `__
-  for LLM inference using NPU.
-
-.. raw:: html
-
- -.. raw:: html - - - -Click here to see available models options - -.. raw:: html - - - -- **tiny-llama-1b-chat** - This is the chat model finetuned on top of - `TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T `__. - The TinyLlama project aims to pretrain a 1.1B Llama model on 3 - trillion tokens with the adoption of the same architecture and - tokenizer as Llama 2. This means TinyLlama can be plugged and played - in many open-source projects built upon Llama. Besides, TinyLlama is - compact with only 1.1B parameters. This compactness allows it to - cater to a multitude of applications demanding a restricted - computation and memory footprint. More details about model can be - found in `model - card `__ -- **minicpm-2b-dpo** - MiniCPM is an End-Size LLM developed by - ModelBest Inc. and TsinghuaNLP, with only 2.4B parameters excluding - embeddings. After Direct Preference Optimization (DPO) fine-tuning, - MiniCPM outperforms many popular 7b, 13b and 70b models. More details - can be found in - `model_card `__. -- **minicpm3-4b** - MiniCPM3-4B is the 3rd generation of MiniCPM - series. The overall performance of MiniCPM3-4B surpasses - Phi-3.5-mini-Instruct, being comparable with many recent 7B~9B - models.Compared to previous generations, MiniCPM3-4B has a more - powerful and versatile skill set to enable more general usage. More - details can be found in `model - card `__. -- **llama-3.2-1B-instruct** - 1B parameters model from LLama3.2 - collection of instruction-tuned multilingual models. Llama 3.2 - instruction-tuned text only models are optimized for multilingual - dialogue use cases, including agentic retrieval and summarization - tasks. They outperform many of the available open source and closed - chat models on common industry benchmarks. More details can be found - in `model - card `__ - >\ **Note**: run model with demo, you will need to accept license - agreement. >You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, - carefully read terms of usage and click accept button. You will need - to use an access token for the code below to run. For more - information on access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: python - - # login to huggingfacehub to get access to pretrained model - - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -- **llama-3.2-3B-instruct** - 3B parameters model from LLama3.2 - collection of instruction-tuned multilingual models. Llama 3.2 - instruction-tuned text only models are optimized for multilingual - dialogue use cases, including agentic retrieval and summarization - tasks. They outperform many of the available open source and closed - chat models on common industry benchmarks. More details can be found - in `model - card `__ - >\ **Note**: run model with demo, you will need to accept license - agreement. >You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, - carefully read terms of usage and click accept button. You will need - to use an access token for the code below to run. For more - information on access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. 
code:: python - - # login to huggingfacehub to get access to pretrained model - - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -- **gemma-2b-it** - Gemma is a family of lightweight, state-of-the-art - open models from Google, built from the same research and technology - used to create the Gemini models. They are text-to-text, decoder-only - large language models, available in English, with open weights, - pre-trained variants, and instruction-tuned variants. Gemma models - are well-suited for a variety of text generation tasks, including - question answering, summarization, and reasoning. This model is - instruction-tuned version of 2B parameters model. More details about - model can be found in `model - card `__. >\ **Note**: run - model with demo, you will need to accept license agreement. >You must - be a registered user in Hugging Face Hub. Please visit - `HuggingFace model - card `__, carefully read - terms of usage and click accept button. You will need to use an - access token for the code below to run. For more information on - access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: python - - # login to huggingfacehub to get access to pretrained model - - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -- **gemma-2-2b-it** - Gemma2 is the second generation of a Gemma family - of lightweight, state-of-the-art open models from Google, built from - the same research and technology used to create the Gemini models. - They are text-to-text, decoder-only large language models, available - in English, with open weights, pre-trained variants, and - instruction-tuned variants. Gemma models are well-suited for a - variety of text generation tasks, including question answering, - summarization, and reasoning. This model is instruction-tuned version - of 2B parameters model. More details about model can be found in - `model card `__. - >\ **Note**: run model with demo, you will need to accept license - agreement. >You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, carefully read - terms of usage and click accept button. You will need to use an - access token for the code below to run. For more information on - access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: python - - # login to huggingfacehub to get access to pretrained model - - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -- **phi-3-mini-instruct** - The Phi-3-Mini is a 3.8B parameters, - lightweight, state-of-the-art open model trained with the Phi-3 - datasets that includes both synthetic data and the filtered publicly - available websites data with a focus on high-quality and reasoning - dense properties. More details about model can be found in `model - card `__, - `Microsoft blog `__ and `technical - report `__. 
-- **phi-3.5-mini-instruct** - Phi-3.5-mini is a lightweight, - state-of-the-art open model built upon datasets used for Phi-3 - - synthetic data and filtered publicly available websites - with a - focus on very high-quality, reasoning dense data. The model belongs - to the Phi-3 model family and supports 128K token context length. The - model underwent a rigorous enhancement process, incorporating both - supervised fine-tuning, proximal policy optimization, and direct - preference optimization to ensure precise instruction adherence and - robust safety measures. More details about model can be found in - `model - card `__, - `Microsoft blog `__ and `technical - report `__. -- **phi-4** - Phi-4 is 14B model that built upon a blend of synthetic - datasets, data from filtered public domain websites, and acquired - academic books and Q&A datasets. The goal of this approach was to - ensure that small capable models were trained with data focused on - high quality and advanced reasoning. Phi-4 underwent a rigorous - enhancement and alignment process, incorporating both supervised - fine-tuning and direct preference optimization to ensure precise - instruction adherence and robust safety measures. More details about - model can be found in - `model_card `__, `technical - report `__ and `Microsoft - blog `__. -- **red-pajama-3b-chat** - A 2.8B parameter pre-trained language model - based on GPT-NEOX architecture. It was developed by Together Computer - and leaders from the open-source AI community. The model is - fine-tuned on OASST1 and Dolly2 datasets to enhance chatting ability. - More details about model can be found in `HuggingFace model - card `__. -- **gemma-7b-it** - Gemma is a family of lightweight, state-of-the-art - open models from Google, built from the same research and technology - used to create the Gemini models. They are text-to-text, decoder-only - large language models, available in English, with open weights, - pre-trained variants, and instruction-tuned variants. Gemma models - are well-suited for a variety of text generation tasks, including - question answering, summarization, and reasoning. This model is - instruction-tuned version of 7B parameters model. More details about - model can be found in `model - card `__. >\ **Note**: run - model with demo, you will need to accept license agreement. >You must - be a registered user in Hugging Face Hub. Please visit - `HuggingFace model - card `__, carefully read - terms of usage and click accept button. You will need to use an - access token for the code below to run. For more information on - access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: python - - # login to huggingfacehub to get access to pretrained model - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -- **gemma-2-9b-it** - Gemma2 is the second generation of a Gemma family - of lightweight, state-of-the-art open models from Google, built from - the same research and technology used to create the Gemini models. - They are text-to-text, decoder-only large language models, available - in English, with open weights, pre-trained variants, and - instruction-tuned variants. Gemma models are well-suited for a - variety of text generation tasks, including question answering, - summarization, and reasoning. 
This model is instruction-tuned version - of 9B parameters model. More details about model can be found in - `model card `__. - >\ **Note**: run model with demo, you will need to accept license - agreement. >You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, carefully read - terms of usage and click accept button. You will need to use an - access token for the code below to run. For more information on - access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: python - - # login to huggingfacehub to get access to pretrained model - - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -- **llama-2-7b-chat** - LLama 2 is the second generation of LLama - models developed by Meta. Llama 2 is a collection of pre-trained and - fine-tuned generative text models ranging in scale from 7 billion to - 70 billion parameters. llama-2-7b-chat is 7 billions parameters - version of LLama 2 finetuned and optimized for dialogue use case. - More details about model can be found in the - `paper `__, - `repository `__ and - `HuggingFace model - card `__. - >\ **Note**: run model with demo, you will need to accept license - agreement. >You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, - carefully read terms of usage and click accept button. You will need - to use an access token for the code below to run. For more - information on access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: python - - # login to huggingfacehub to get access to pretrained model - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -- **llama-3-8b-instruct** - Llama 3 is an auto-regressive language - model that uses an optimized transformer architecture. The tuned - versions use supervised fine-tuning (SFT) and reinforcement learning - with human feedback (RLHF) to align with human preferences for - helpfulness and safety. The Llama 3 instruction tuned models are - optimized for dialogue use cases and outperform many of the available - open source chat models on common industry benchmarks. More details - about model can be found in `Meta blog - post `__, `model - website `__ and `model - card `__. - >\ **Note**: run model with demo, you will need to accept license - agreement. >You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, - carefully read terms of usage and click accept button. You will need - to use an access token for the code below to run. For more - information on access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. 
code:: python - - # login to huggingfacehub to get access to pretrained model - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -- **llama-3.1-8b-instruct** - The Llama 3.1 instruction tuned text only - models (8B, 70B, 405B) are optimized for multilingual dialogue use - cases and outperform many of the available open source and closed - chat models on common industry benchmarks. More details about model - can be found in `Meta blog - post `__, `model - website `__ and `model - card `__. - >\ **Note**: run model with demo, you will need to accept license - agreement. >You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, - carefully read terms of usage and click accept button. You will need - to use an access token for the code below to run. For more - information on access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: python - - # login to huggingfacehub to get access to pretrained model - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -- **qwen2.5-0.5b-instruct/qwen2.5-1.5b-instruct/qwen2.5-3b-instruct/qwen2.5-7b-instruct/qwen2.5-14b-instruct** - - Qwen2.5 is the latest series of Qwen large language models. - Comparing with Qwen2, Qwen2.5 series brings significant improvements - in coding, mathematics and general knowledge skills. Additionally, it - brings long-context and multiple languages support including Chinese, - English, French, Spanish, Portuguese, German, Italian, Russian, - Japanese, Korean, Vietnamese, Thai, Arabic, and more. For more - details, please refer to - `model_card `__, - `blog `__, - `GitHub `__, and - `Documentation `__. -- **qwen-7b-chat** - Qwen-7B is the 7B-parameter version of the large - language model series, Qwen (abbr. Tongyi Qianwen), proposed by - Alibaba Cloud. Qwen-7B is a Transformer-based large language model, - which is pretrained on a large volume of data, including web texts, - books, codes, etc. For more details about Qwen, please refer to the - `GitHub `__ code repository. -- **chatglm3-6b** - ChatGLM3-6B is the latest open-source model in the - ChatGLM series. While retaining many excellent features such as - smooth dialogue and low deployment threshold from the previous two - generations, ChatGLM3-6B employs a more diverse training dataset, - more sufficient training steps, and a more reasonable training - strategy. ChatGLM3-6B adopts a newly designed `Prompt - format `__, - in addition to the normal multi-turn dialogue. You can find more - details about model in the `model - card `__ -- **mistral-7b** - The Mistral-7B-v0.1 Large Language Model (LLM) is a - pretrained generative text model with 7 billion parameters. You can - find more details about model in the `model - card `__, - `paper `__ and `release blog - post `__. -- **zephyr-7b-beta** - Zephyr is a series of language models that are - trained to act as helpful assistants. Zephyr-7B-beta is the second - model in the series, and is a fine-tuned version of - `mistralai/Mistral-7B-v0.1 `__ - that was trained on on a mix of publicly available, synthetic - datasets using `Direct Preference Optimization - (DPO) `__. You can find more - details about model in `technical - report `__ and `HuggingFace model - card `__. 
-- **neural-chat-7b-v3-1** - Mistral-7b model fine-tuned using Intel - Gaudi. The model fine-tuned on the open source dataset - `Open-Orca/SlimOrca `__ - and aligned with `Direct Preference Optimization (DPO) - algorithm `__. More details can be - found in `model - card `__ and `blog - post `__. -- **notus-7b-v1** - Notus is a collection of fine-tuned models using - `Direct Preference Optimization - (DPO) `__. and related - `RLHF `__ techniques. This model is - the first version, fine-tuned with DPO over zephyr-7b-sft. Following - a data-first approach, the only difference between Notus-7B-v1 and - Zephyr-7B-beta is the preference dataset used for dDPO. Proposed - approach for dataset creation helps to effectively fine-tune Notus-7b - that surpasses Zephyr-7B-beta and Claude 2 on - `AlpacaEval `__. More - details about model can be found in `model - card `__. -- **youri-7b-chat** - Youri-7b-chat is a Llama2 based model. `Rinna - Co., Ltd. `__ conducted further pre-training - for the Llama2 model with a mixture of English and Japanese datasets - to improve Japanese task capability. The model is publicly released - on Hugging Face hub. You can find detailed information at the - `rinna/youri-7b-chat project - page `__. -- **baichuan2-7b-chat** - Baichuan 2 is the new generation of - large-scale open-source language models launched by `Baichuan - Intelligence inc `__. It is trained - on a high-quality corpus with 2.6 trillion tokens and has achieved - the best performance in authoritative Chinese and English benchmarks - of the same size. -- **internlm2-chat-1.8b** - InternLM2 is the second generation InternLM - series. Compared to the previous generation model, it shows - significant improvements in various capabilities, including - reasoning, mathematics, and coding. More details about model can be - found in `model repository `__. -- **glm-4-9b-chat** - GLM-4-9B is the open-source version of the latest - generation of pre-trained models in the GLM-4 series launched by - Zhipu AI. In the evaluation of data sets in semantics, mathematics, - reasoning, code, and knowledge, GLM-4-9B and its human - preference-aligned version GLM-4-9B-Chat have shown superior - performance beyond Llama-3-8B. In addition to multi-round - conversations, GLM-4-9B-Chat also has advanced features such as web - browsing, code execution, custom tool calls (Function Call), and long - text reasoning (supporting up to 128K context). More details about - model can be found in `model - card `__, - `technical report `__ and - `repository `__ -- **DeepSeek-R1-Distill-Qwen-1.5B** - Qwen2.5-1.5B fine-tuned using the - reasoning data generated by - `DeepSeek-R1 `__. You - can find more info in `model - card `__ -- **DeepSeek-R1-Distill-Qwen-7B** - Qwen2.5-7B fine-tuned using the - reasoning data generated by - `DeepSeek-R1 `__. You - can find more info in `model - card `__ -- **DeepSeek-R1-Distill-Llama-8B** - Llama-3.1-8B fine-tuned using the - reasoning data generated by - `DeepSeek-R1 `__. You - can find more info in `model - card `__ -
`Optimum Intel `__ is the interface between the `Transformers `__ and `Diffusers `__ libraries and OpenVINO to accelerate end-to-end pipelines on Intel architectures. It provides an easy-to-use CLI interface for exporting models to the `OpenVINO Intermediate Representation (IR) `__ format.

.. raw:: html

.. raw:: html

Click here to read more about Optimum CLI usage

.. raw:: html

The command below demonstrates the basic usage of ``optimum-cli`` for model export:

::

   optimum-cli export openvino --model <model_id> --task <task> <output_dir>

where the ``--model`` argument is the model id from the HuggingFace Hub or a local directory with the model (saved using the ``.save_pretrained`` method), and ``--task`` is one of the `supported tasks `__ that the exported model should solve. For LLMs it is recommended to use ``text-generation-with-past``. If model initialization requires using remote code, the ``--trust-remote-code`` flag should additionally be passed.

.. raw:: html

- -Weights Compression using Optimum-CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -You can also apply fp16, 8-bit or 4-bit weight compression on the -Linear, Convolutional and Embedding layers when exporting your model -with the CLI. - -.. raw:: html - -
.. raw:: html

Click here to read more about weights compression with Optimum CLI

.. raw:: html

Set ``--weight-format`` to fp16, int8, or int4, respectively. This type of optimization reduces the memory footprint and inference latency. By default, the quantization scheme for int8/int4 will be `asymmetric `__; to make it `symmetric `__ you can add ``--sym``.

For INT4 quantization you can also specify the following arguments:

- The ``--group-size`` parameter defines the group size to use for quantization; -1 results in per-column quantization.
- The ``--ratio`` parameter controls the ratio between 4-bit and 8-bit quantization. If set to 0.9, it means that 90% of the layers will be quantized to int4 while 10% will be quantized to int8.

Smaller group_size and ratio values usually improve accuracy at the expense of model size and inference latency. You can enable AWQ to be additionally applied during model export with INT4 precision by using the ``--awq`` flag and providing a dataset name with the ``--dataset`` parameter (e.g. ``--dataset wikitext2``).

   **Note**: Applying AWQ requires significant memory and time.

..

   **Note**: It is possible that there will be no matching patterns in the model to apply AWQ; in such case it will be skipped.

.. raw:: html

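As an illustration only (the model id and output directory below are placeholder assumptions, not values used by this notebook), a full INT4 export with AWQ could look like:

::

   optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
       --task text-generation-with-past --weight-format int4 \
       --group-size 128 --ratio 0.8 --sym \
       --awq --dataset wikitext2 tiny-llama-1b-chat/INT4_compressed_weights

In this tutorial, however, the export and compression are driven by the ``convert_and_compress_model`` helper shown below.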
- -.. code:: ipython3 - - from llm_config import convert_and_compress_model - - model_dir = convert_and_compress_model(model_id, model_configuration, compression_variant.value, use_preconverted.value) - - -.. parsed-literal:: - - ✅ INT4 DeepSeek-R1-Distill-Llama-8B model already converted and can be found in DeepSeek-R1-Distill-Llama-8B/INT4_compressed_weights - - -Let’s compare model size for different compression types - -.. code:: ipython3 - - from llm_config import compare_model_size - - compare_model_size(model_dir) - - -.. parsed-literal:: - - Size of model with INT4 compressed weights is 5102.71 MB - - -Instantiate pipeline with OpenVINO Generate API ------------------------------------------------ - - - -`OpenVINO Generate -API `__ -can be used to create pipelines to run an inference with OpenVINO -Runtime. - -Firstly we need to create a pipeline with ``LLMPipeline``. -``LLMPipeline`` is the main object used for text generation using LLM in -OpenVINO GenAI API. You can construct it straight away from the folder -with the converted model. We will provide directory with model and -device for ``LLMPipeline``. Then we run ``generate`` method and get the -output in text format. Additionally, we can configure parameters for -decoding. We can create the default config with -``ov_genai.GenerationConfig()``, setup parameters, and apply the updated -version with ``set_generation_config(config)`` or put config directly to -``generate()``. It’s also possible to specify the needed options just as -inputs in the ``generate()`` method, as shown below, e.g. we can add -``max_new_tokens`` to stop generation if a specified number of tokens is -generated and the end of generation is not reached. We will discuss some -of the available generation parameters more deeply later. Generation -process for long response may be time consuming, for accessing partial -result as soon as it is generated without waiting when whole process -finished, Streaming API can be used. Token streaming is the mode in -which the generative system returns the tokens one by one as the model -generates them. This enables showing progressive generations to the user -rather than waiting for the whole generation. Streaming is an essential -aspect of the end-user experience as it reduces latency, one of the most -critical aspects of a smooth experience. In code below, we implement -simple streamer for printing output result. For more advanced streamer -example please check openvino.genai -`sample `__. - -.. code:: ipython3 - - import openvino_genai as ov_genai - import sys - - print(f"Loading model from {model_dir}\n") - - - pipe = ov_genai.LLMPipeline(str(model_dir), device.value) - - generation_config = ov_genai.GenerationConfig() - generation_config.max_new_tokens = 128 - - - def streamer(subword): - print(subword, end="", flush=True) - sys.stdout.flush() - # Return flag corresponds whether generation should be stopped. - # False means continue generation. - return False - - - input_prompt = "What is OpenVINO?" - print(f"Input text: {input_prompt}") - result = pipe.generate(input_prompt, generation_config, streamer) - - -.. parsed-literal:: - - Loading model from DeepSeek-R1-Distill-Llama-8B/INT4_compressed_weights - - Input text: What is OpenVINO? - It's a tool that helps in optimizing AI models for different platforms, including mobile and edge devices. It's designed to make AI deployment easier by providing tools for model optimization, quantization, and more. - - I remember that OpenVINO is an open-source toolkit. 
So, it's free to use and modify.

    It's used for optimizing AI models, which means it helps in making the models smaller and faster, so they can run on devices with limited resources.

    I think OpenVINO provides tools for model quantization, which is the process of reducing the precision of the model's weights to make them smaller, which helps in deploying them

Run Chatbot
-----------

Now that the model is created, we can set up the chatbot interface using `Gradio `__.

.. raw:: html

.. raw:: html

Click here to see how pipeline works

.. raw:: html

The diagram below illustrates how the chatbot pipeline works.

.. figure:: https://github.com/user-attachments/assets/9c9b56e1-01a6-48d8-aa46-222a88e25066
   :alt: llm_diagram

   llm_diagram

As you can see, the user input question is passed through the tokenizer to apply chat-specific formatting (the chat template) and turn the provided string into numeric format. `OpenVINO Tokenizers `__ are used for these purposes inside ``LLMPipeline``. You can find more detailed info about tokenization theory and OpenVINO Tokenizers in this `tutorial `__. The tokenized input is then passed to the LLM, which predicts the probability of the next token. How the next token is selected from the predicted probabilities is driven by the chosen decoding methodology. You can find more information about the most popular decoding methods in this `blog `__. The sampler selects the next token id according to the generation configuration. Next, a stop-generation condition is checked (e.g. whether the maximum number of new tokens has been reached, or whether the next token id equals the end-of-generation token). If the end of generation is not reached, the newly generated token id is used as the input for the next iteration, and the generation cycle repeats until the stop condition is met. When the stop-generation criteria are met, the OpenVINO Detokenizer decodes the generated token ids into the text answer.

The difference between chatbot and instruction-following pipelines is that the model should have “memory” to find correct answers over a chain of connected questions. OpenVINO GenAI uses a ``KVCache`` representation to maintain conversation history. By default, ``LLMPipeline`` resets ``KVCache`` after each ``generate`` call. To keep conversational history, we should move the LLMPipeline to chat mode using the ``start_chat()`` method, as shown in the sketch below.

More info about OpenVINO LLM inference can be found in the `LLM Inference Guide `__.

.. raw:: html

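As a minimal sketch of chat mode (reusing the ``pipe`` object created above; the prompts are arbitrary examples), conversation history can be preserved like this:

.. code:: python

    # keep the KVCache (conversation history) between generate() calls
    pipe.start_chat()
    print(pipe.generate("What is OpenVINO?", max_new_tokens=128))
    # the follow-up question can refer to the previous answer
    print(pipe.generate("How can I install it?", max_new_tokens=128))
    pipe.finish_chat()  # reset the history once the conversation is over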
Advanced generation options
~~~~~~~~~~~~~~~~~~~~~~~~~~~

   **Note**: The NPU inference pipeline does not support providing advanced options at the moment; they will be disabled if NPU is selected.

.. raw:: html

- -.. raw:: html - - - -Click here to see detailed description of advanced options - -.. raw:: html - - - -| There are several parameters that can control text generation quality, - \* ``Temperature`` is a parameter used to control the level of - creativity in AI-generated text. By adjusting the ``temperature``, you - can influence the AI model’s probability distribution, making the text - more focused or diverse. -| Consider the following example: The AI model has to complete the - sentence “The cat is \____.” with the following token probabilities: - -| playing: 0.5 -| sleeping: 0.25 -| eating: 0.15 -| driving: 0.05 -| flying: 0.05 - -- **Low temperature** (e.g., 0.2): The AI model becomes more focused - and deterministic, choosing tokens with the highest probability, such - as “playing.” - - - **Medium temperature** (e.g., 1.0): The AI model maintains a - balance between creativity and focus, selecting tokens based on - their probabilities without significant bias, such as “playing,” - “sleeping,” or “eating.” - - **High temperature** (e.g., 2.0): The AI model becomes more - adventurous, increasing the chances of selecting less likely - tokens, such as “driving” and “flying.” - -- ``Top-p``, also known as nucleus sampling, is a parameter used to - control the range of tokens considered by the AI model based on their - cumulative probability. By adjusting the ``top-p`` value, you can - influence the AI model’s token selection, making it more focused or - diverse. Using the same example with the cat, consider the following - top_p settings: - - - **Low top_p** (e.g., 0.5): The AI model considers only tokens with - the highest cumulative probability, such as “playing.” - - **Medium top_p** (e.g., 0.8): The AI model considers tokens with a - higher cumulative probability, such as “playing,” “sleeping,” and - “eating.” - - **High top_p** (e.g., 1.0): The AI model considers all tokens, - including those with lower probabilities, such as “driving” and - “flying.” - -- ``Top-k`` is an another popular sampling strategy. In comparison with - Top-P, which chooses from the smallest possible set of words whose - cumulative probability exceeds the probability P, in Top-K sampling K - most likely next words are filtered and the probability mass is - redistributed among only those K next words. In our example with cat, - if k=3, then only “playing”, “sleeping” and “eating” will be taken - into account as possible next word. -- ``Repetition Penalty`` This parameter can help penalize tokens based - on how frequently they occur in the text, including the input prompt. - A token that has already appeared five times is penalized more - heavily than a token that has appeared only one time. A value of 1 - means that there is no penalty and values larger than 1 discourage - repeated tokens. - -.. raw:: html - -
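For illustration, the sketch below shows one possible way to pass these options through ``GenerationConfig`` (the parameter values are arbitrary examples, not recommended defaults):

.. code:: python

    import openvino_genai as ov_genai

    config = ov_genai.GenerationConfig()
    config.max_new_tokens = 256
    config.do_sample = True          # enable sampling instead of greedy decoding
    config.temperature = 0.7         # lower -> more focused, higher -> more diverse
    config.top_p = 0.9               # nucleus sampling threshold
    config.top_k = 50                # keep only the 50 most likely tokens
    config.repetition_penalty = 1.1  # values > 1 discourage repeated tokens

    result = pipe.generate("Write a short story about a cat.", config)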
- -.. code:: ipython3 - - if not Path("gradio_helper_genai.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-chatbot/gradio_helper_genai.py") - open("gradio_helper_genai.py", "w").write(r.text) - - from gradio_helper_genai import make_demo - - demo = make_demo(pipe, model_configuration, model_id, lang.value, device.value == "NPU") - - try: - demo.launch(debug=True) - except Exception: - demo.launch(debug=True, share=True) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ diff --git a/docs/notebooks/llm-chatbot-with-output.rst b/docs/notebooks/llm-chatbot-with-output.rst deleted file mode 100644 index 7aff2a589310a7..00000000000000 --- a/docs/notebooks/llm-chatbot-with-output.rst +++ /dev/null @@ -1,1414 +0,0 @@ -Create an LLM-powered Chatbot using OpenVINO -============================================ - -In the rapidly evolving world of artificial intelligence (AI), chatbots -have emerged as powerful tools for businesses to enhance customer -interactions and streamline operations. Large Language Models (LLMs) are -artificial intelligence systems that can understand and generate human -language. They use deep learning algorithms and massive amounts of data -to learn the nuances of language and produce coherent and relevant -responses. While a decent intent-based chatbot can answer basic, -one-touch inquiries like order management, FAQs, and policy questions, -LLM chatbots can tackle more complex, multi-touch questions. LLM enables -chatbots to provide support in a conversational manner, similar to how -humans do, through contextual memory. Leveraging the capabilities of -Language Models, chatbots are becoming increasingly intelligent, capable -of understanding and responding to human language with remarkable -accuracy. - -Previously, we already discussed how to build an instruction-following -pipeline using OpenVINO and Optimum Intel, please check out `Dolly -example `__ for reference. In this -tutorial, we consider how to use the power of OpenVINO for running Large -Language Models for chat. We will use a pre-trained model from the -`Hugging Face -Transformers `__ -library. To simplify the user experience, the `Hugging Face Optimum -Intel `__ library is -used to convert the models to OpenVINO™ IR format and to create -inference pipeline. The inference pipeline can also be created using -`OpenVINO Generate -API `__, -the example of that, please, see in the notebook `LLM chatbot with -OpenVINO Generate API `__ - -The tutorial consists of the following steps: - -- Install prerequisites -- Download and convert the model from a public source using the - `OpenVINO integration with Hugging Face - Optimum `__. 
-- Compress model weights to 4-bit or 8-bit data types using - `NNCF `__ -- Create a chat inference pipeline -- Run chat pipeline - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Select model for inference <#select-model-for-inference>`__ -- `Convert model using Optimum-CLI - tool <#convert-model-using-optimum-cli-tool>`__ -- `Compress model weights <#compress-model-weights>`__ - - - `Weights Compression using - Optimum-CLI <#weights-compression-using-optimum-cli>`__ - - `Weight compression with AWQ <#weight-compression-with-awq>`__ - -- `Select device for inference and model - variant <#select-device-for-inference-and-model-variant>`__ -- `Instantiate Model using Optimum - Intel <#instantiate-model-using-optimum-intel>`__ -- `Run Chatbot <#run-chatbot>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install required dependencies - -.. code:: ipython3 - - import os - import platform - - os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - - %pip install -Uq pip - %pip uninstall -q -y optimum optimum-intel - %pip install --pre -Uq "openvino>=2024.2.0" openvino-tokenizers[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu\ - "git+https://github.com/huggingface/optimum-intel.git"\ - "nncf==2.14.1"\ - "torch>=2.1"\ - "datasets" \ - "accelerate" \ - "gradio>=4.19" \ - "huggingface-hub>=0.26.5" \ - "einops" "transformers>=4.43.1" "transformers_stream_generator" "tiktoken" "bitsandbytes" - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - -.. code:: ipython3 - - import os - from pathlib import Path - import requests - import shutil - - # fetch model configuration - - config_shared_path = Path("../../utils/llm_config.py") - config_dst_path = Path("llm_config.py") - - if not config_dst_path.exists(): - if config_shared_path.exists(): - try: - os.symlink(config_shared_path, config_dst_path) - except Exception: - shutil.copy(config_shared_path, config_dst_path) - else: - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py") - with open("llm_config.py", "w", encoding="utf-8") as f: - f.write(r.text) - elif not os.path.islink(config_dst_path): - print("LLM config will be updated") - if config_shared_path.exists(): - shutil.copy(config_shared_path, config_dst_path) - else: - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py") - with open("llm_config.py", "w", encoding="utf-8") as f: - f.write(r.text) - -Select model for inference --------------------------- - - - -The tutorial supports different models, you can select one from the -provided options to compare the quality of open source LLM solutions. ->\ **Note**: conversion of some models can require additional actions -from user side and at least 64GB RAM for conversion. - -.. raw:: html - -
- -Click here to see available models options - -- **tiny-llama-1b-chat** - This is the chat model finetuned on top of - `TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T `__. - The TinyLlama project aims to pretrain a 1.1B Llama model on 3 - trillion tokens with the adoption of the same architecture and - tokenizer as Llama 2. This means TinyLlama can be plugged and played - in many open-source projects built upon Llama. Besides, TinyLlama is - compact with only 1.1B parameters. This compactness allows it to - cater to a multitude of applications demanding a restricted - computation and memory footprint. More details about model can be - found in `model - card `__ -- **minicpm-2b-dpo** - MiniCPM is an End-Size LLM developed by - ModelBest Inc. and TsinghuaNLP, with only 2.4B parameters excluding - embeddings. After Direct Preference Optimization (DPO) fine-tuning, - MiniCPM outperforms many popular 7b, 13b and 70b models. More details - can be found in - `model_card `__. -- **minicpm3-4b** - MiniCPM3-4B is the 3rd generation of MiniCPM - series. The overall performance of MiniCPM3-4B surpasses - Phi-3.5-mini-Instruct, being comparable with many recent 7B~9B - models.Compared to previous generations, MiniCPM3-4B has a more - powerful and versatile skill set to enable more general usage. More - details can be found in `model - card `__. -- **gemma-2b-it** - Gemma is a family of lightweight, state-of-the-art - open models from Google, built from the same research and technology - used to create the Gemini models. They are text-to-text, decoder-only - large language models, available in English, with open weights, - pre-trained variants, and instruction-tuned variants. Gemma models - are well-suited for a variety of text generation tasks, including - question answering, summarization, and reasoning. This model is - instruction-tuned version of 2B parameters model. More details about - model can be found in `model - card `__. >\ **Note**: run - model with demo, you will need to accept license agreement. >You must - be a registered user in Hugging Face Hub. Please visit `HuggingFace - model card `__, carefully - read terms of usage and click accept button. You will need to use an - access token for the code below to run. For more information on - access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: python - - ## login to huggingfacehub to get access to pretrained model - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -- **gemma-2-2b-it** - Gemma2 is the second generation of a Gemma family - of lightweight, state-of-the-art open models from Google, built from - the same research and technology used to create the Gemini models. - They are text-to-text, decoder-only large language models, available - in English, with open weights, pre-trained variants, and - instruction-tuned variants. Gemma models are well-suited for a - variety of text generation tasks, including question answering, - summarization, and reasoning. This model is instruction-tuned version - of 2B parameters model. More details about model can be found in - `model card `__. - >\ **Note**: run model with demo, you will need to accept license - agreement. >You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, carefully read - terms of usage and click accept button. 
You will need to use an - access token for the code below to run. For more information on - access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: python - - # login to huggingfacehub to get access to pretrained model - - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -- **phi-3-mini-instruct** - The Phi-3-Mini is a 3.8B parameters, - lightweight, state-of-the-art open model trained with the Phi-3 - datasets that includes both synthetic data and the filtered publicly - available websites data with a focus on high-quality and reasoning - dense properties. More details about model can be found in `model - card `__, - `Microsoft blog `__ and `technical - report `__. -- **phi-3.5-mini-instruct** - Phi-3.5-mini is a lightweight, - state-of-the-art open model built upon datasets used for Phi-3 - - synthetic data and filtered publicly available websites - with a - focus on very high-quality, reasoning dense data. The model belongs - to the Phi-3 model family and supports 128K token context length. The - model underwent a rigorous enhancement process, incorporating both - supervised fine-tuning, proximal policy optimization, and direct - preference optimization to ensure precise instruction adherence and - robust safety measures. More details about model can be found in - `model - card `__, - `Microsoft blog `__ and `technical - report `__. -- **phi-4** - Phi-4 is 14B model that built upon a blend of synthetic - datasets, data from filtered public domain websites, and acquired - academic books and Q&A datasets. The goal of this approach was to - ensure that small capable models were trained with data focused on - high quality and advanced reasoning. Phi-4 underwent a rigorous - enhancement and alignment process, incorporating both supervised - fine-tuning and direct preference optimization to ensure precise - instruction adherence and robust safety measures. More details about - model can be found in - `model_card `__, `technical - report `__ and `Microsoft - blog `__. -- **red-pajama-3b-chat** - A 2.8B parameter pre-trained language model - based on GPT-NEOX architecture. It was developed by Together Computer - and leaders from the open-source AI community. The model is - fine-tuned on OASST1 and Dolly2 datasets to enhance chatting ability. - More details about model can be found in `HuggingFace model - card `__. -- **gemma-7b-it** - Gemma is a family of lightweight, state-of-the-art - open models from Google, built from the same research and technology - used to create the Gemini models. They are text-to-text, decoder-only - large language models, available in English, with open weights, - pre-trained variants, and instruction-tuned variants. Gemma models - are well-suited for a variety of text generation tasks, including - question answering, summarization, and reasoning. This model is - instruction-tuned version of 7B parameters model. More details about - model can be found in `model - card `__. >\ **Note**: run - model with demo, you will need to accept license agreement. >You must - be a registered user in Hugging Face Hub. Please visit `HuggingFace - model card `__, carefully - read terms of usage and click accept button. You will need to use an - access token for the code below to run. 
For more information on - access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: python - - ## login to huggingfacehub to get access to pretrained model - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -- **gemma-2-9b-it** - Gemma2 is the second generation of a Gemma family - of lightweight, state-of-the-art open models from Google, built from - the same research and technology used to create the Gemini models. - They are text-to-text, decoder-only large language models, available - in English, with open weights, pre-trained variants, and - instruction-tuned variants. Gemma models are well-suited for a - variety of text generation tasks, including question answering, - summarization, and reasoning. This model is instruction-tuned version - of 9B parameters model. More details about model can be found in - `model card `__. - >\ **Note**: run model with demo, you will need to accept license - agreement. >You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, carefully read - terms of usage and click accept button. You will need to use an - access token for the code below to run. For more information on - access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: python - - # login to huggingfacehub to get access to pretrained model - - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -- **llama-2-7b-chat** - LLama 2 is the second generation of LLama - models developed by Meta. Llama 2 is a collection of pre-trained and - fine-tuned generative text models ranging in scale from 7 billion to - 70 billion parameters. llama-2-7b-chat is 7 billions parameters - version of LLama 2 finetuned and optimized for dialogue use case. - More details about model can be found in the - `paper `__, - `repository `__ and - `HuggingFace model - card `__. - >\ **Note**: run model with demo, you will need to accept license - agreement. >You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, - carefully read terms of usage and click accept button. You will need - to use an access token for the code below to run. For more - information on access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: python - - ## login to huggingfacehub to get access to pretrained model - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -- **llama-3-8b-instruct** - Llama 3 is an auto-regressive language - model that uses an optimized transformer architecture. The tuned - versions use supervised fine-tuning (SFT) and reinforcement learning - with human feedback (RLHF) to align with human preferences for - helpfulness and safety. The Llama 3 instruction tuned models are - optimized for dialogue use cases and outperform many of the available - open source chat models on common industry benchmarks. More details - about model can be found in `Meta blog - post `__, `model - website `__ and `model - card `__. 
- >\ **Note**: run model with demo, you will need to accept license - agreement. >You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, - carefully read terms of usage and click accept button. You will need - to use an access token for the code below to run. For more - information on access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: python - - ## login to huggingfacehub to get access to pretrained model - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -- **llama-3.1-8b-instruct** - The Llama 3.1 instruction tuned text only - models (8B, 70B, 405B) are optimized for multilingual dialogue use - cases and outperform many of the available open source and closed - chat models on common industry benchmarks. More details about model - can be found in `Meta blog - post `__, `model - website `__ and `model - card `__. - >\ **Note**: run model with demo, you will need to accept license - agreement. >You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, - carefully read terms of usage and click accept button. You will need - to use an access token for the code below to run. For more - information on access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: python - - ## login to huggingfacehub to get access to pretrained model - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -- **qwen2.5-0.5b-instruct/qwen2.5-1.5b-instruct/qwen2.5-3b-instruct/qwen2.5-7b-instruct/qwen2.5-14b-instruct** - - Qwen2.5 is the latest series of Qwen large language models. - Comparing with Qwen2, Qwen2.5 series brings significant improvements - in coding, mathematics and general knowledge skills. Additionally, it - brings long-context and multiple languages support including Chinese, - English, French, Spanish, Portuguese, German, Italian, Russian, - Japanese, Korean, Vietnamese, Thai, Arabic, and more. For more - details, please refer to - `model_card `__, - `blog `__, - `GitHub `__, and - `Documentation `__. -- **qwen-7b-chat** - Qwen-7B is the 7B-parameter version of the large - language model series, Qwen (abbr. Tongyi Qianwen), proposed by - Alibaba Cloud. Qwen-7B is a Transformer-based large language model, - which is pretrained on a large volume of data, including web texts, - books, codes, etc. For more details about Qwen, please refer to the - `GitHub `__ code repository. -- **mpt-7b-chat** - MPT-7B is part of the family of - MosaicPretrainedTransformer (MPT) models, which use a modified - transformer architecture optimized for efficient training and - inference. These architectural changes include performance-optimized - layer implementations and the elimination of context length limits by - replacing positional embeddings with Attention with Linear Biases - (`ALiBi `__). Thanks to these - modifications, MPT models can be trained with high throughput - efficiency and stable convergence. MPT-7B-chat is a chatbot-like - model for dialogue generation. It was built by finetuning MPT-7B on - the - `ShareGPT-Vicuna `__, - `HC3 `__, - `Alpaca `__, - `HH-RLHF `__, and - `Evol-Instruct `__ - datasets. 
More details about the model can be found in `blog - post `__, - `repository `__ and - `HuggingFace model - card `__. -- **chatglm3-6b** - ChatGLM3-6B is the latest open-source model in the - ChatGLM series. While retaining many excellent features such as - smooth dialogue and low deployment threshold from the previous two - generations, ChatGLM3-6B employs a more diverse training dataset, - more sufficient training steps, and a more reasonable training - strategy. ChatGLM3-6B adopts a newly designed `Prompt - format `__, - in addition to the normal multi-turn dialogue. You can find more - details about model in the `model - card `__ -- **mistral-7b** - The Mistral-7B-v0.1 Large Language Model (LLM) is a - pretrained generative text model with 7 billion parameters. You can - find more details about model in the `model - card `__, - `paper `__ and `release blog - post `__. -- **zephyr-7b-beta** - Zephyr is a series of language models that are - trained to act as helpful assistants. Zephyr-7B-beta is the second - model in the series, and is a fine-tuned version of - `mistralai/Mistral-7B-v0.1 `__ - that was trained on on a mix of publicly available, synthetic - datasets using `Direct Preference Optimization - (DPO) `__. You can find more - details about model in `technical - report `__ and `HuggingFace model - card `__. -- **neural-chat-7b-v3-1** - Mistral-7b model fine-tuned using Intel - Gaudi. The model fine-tuned on the open source dataset - `Open-Orca/SlimOrca `__ - and aligned with `Direct Preference Optimization (DPO) - algorithm `__. More details can be - found in `model - card `__ and `blog - post `__. -- **notus-7b-v1** - Notus is a collection of fine-tuned models using - `Direct Preference Optimization - (DPO) `__. and related - `RLHF `__ techniques. This model is - the first version, fine-tuned with DPO over zephyr-7b-sft. Following - a data-first approach, the only difference between Notus-7B-v1 and - Zephyr-7B-beta is the preference dataset used for dDPO. Proposed - approach for dataset creation helps to effectively fine-tune Notus-7b - that surpasses Zephyr-7B-beta and Claude 2 on - `AlpacaEval `__. More - details about model can be found in `model - card `__. -- **youri-7b-chat** - Youri-7b-chat is a Llama2 based model. `Rinna - Co., Ltd. `__ conducted further pre-training - for the Llama2 model with a mixture of English and Japanese datasets - to improve Japanese task capability. The model is publicly released - on Hugging Face hub. You can find detailed information at the - `rinna/youri-7b-chat project - page `__. -- **baichuan2-7b-chat** - Baichuan 2 is the new generation of - large-scale open-source language models launched by `Baichuan - Intelligence inc `__. It is trained - on a high-quality corpus with 2.6 trillion tokens and has achieved - the best performance in authoritative Chinese and English benchmarks - of the same size. -- **internlm2-chat-1.8b** - InternLM2 is the second generation InternLM - series. Compared to the previous generation model, it shows - significant improvements in various capabilities, including - reasoning, mathematics, and coding. More details about model can be - found in `model repository `__. -- **glm-4-9b-chat** - GLM-4-9B is the open-source version of the latest - generation of pre-trained models in the GLM-4 series launched by - Zhipu AI. 
In the evaluation of data sets in semantics, mathematics, - reasoning, code, and knowledge, GLM-4-9B and its human - preference-aligned version GLM-4-9B-Chat have shown superior - performance beyond Llama-3-8B. In addition to multi-round - conversations, GLM-4-9B-Chat also has advanced features such as web - browsing, code execution, custom tool calls (Function Call), and long - text reasoning (supporting up to 128K context). More details about - model can be found in `model - card `__, - `technical report `__ and - `repository `__ -- **DeepSeek-R1-Distill-Qwen-1.5B** - Qwen2.5-1.5B fine-tuned using the - reasoning data generated by - `DeepSeek-R1 `__. You - can find more info in `model - card `__ -- **DeepSeek-R1-Distill-Qwen-7B** - Qwen2.5-7B fine-tuned using the - reasoning data generated by - `DeepSeek-R1 `__. You - can find more info in `model - card `__ -- **DeepSeek-R1-Distill-Llama-8B** - Llama-3.1-8B fine-tuned using the - reasoning data generated by - `DeepSeek-R1 `__. You - can find more info in `model - card `__ - -.. code:: ipython3 - - from llm_config import SUPPORTED_LLM_MODELS - import ipywidgets as widgets - -.. code:: ipython3 - - model_languages = list(SUPPORTED_LLM_MODELS) - - model_language = widgets.Dropdown( - options=model_languages, - value=model_languages[0], - description="Model Language:", - disabled=False, - ) - - model_language - - - - -.. parsed-literal:: - - Dropdown(description='Model Language:', options=('English', 'Chinese', 'Japanese'), value='English') - - - -.. code:: ipython3 - - model_ids = list(SUPPORTED_LLM_MODELS[model_language.value]) - - model_id = widgets.Dropdown( - options=model_ids, - value=model_ids[0], - description="Model:", - disabled=False, - ) - - model_id - - - - -.. parsed-literal:: - - Dropdown(description='Model:', options=('qwen2-0.5b-instruct', 'tiny-llama-1b-chat', 'qwen2-1.5b-instruct', 'g… - - - -.. code:: ipython3 - - model_configuration = SUPPORTED_LLM_MODELS[model_language.value][model_id.value] - print(f"Selected model {model_id.value}") - - -.. parsed-literal:: - - Selected model qwen2-0.5b-instruct - - -Convert model using Optimum-CLI tool ------------------------------------- - - - -`Optimum Intel `__ is -the interface between the -`Transformers `__ and -`Diffusers `__ libraries -and OpenVINO to accelerate end-to-end pipelines on Intel architectures. -It provides ease-to-use cli interface for exporting models to `OpenVINO -Intermediate Representation -(IR) `__ -format. - -The command bellow demonstrates basic command for model export with -``optimum-cli`` - -:: - - optimum-cli export openvino --model --task - -where ``--model`` argument is model id from HuggingFace Hub or local -directory with model (saved using ``.save_pretrained`` method), -``--task`` is one of `supported -task `__ -that exported model should solve. For LLMs it will be -``text-generation-with-past``. If model initialization requires to use -remote code, ``--trust-remote-code`` flag additionally should be passed. - -Compress model weights ----------------------- - -The `Weights -Compression `__ -algorithm is aimed at compressing the weights of the models and can be -used to optimize the model footprint and performance of large models -where the size of weights is relatively larger than the size of -activations, for example, Large Language Models (LLM). Compared to INT8 -compression, INT4 compression improves performance even more, but -introduces a minor drop in prediction quality. 
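The same weight compression can also be applied programmatically with NNCF to an already exported OpenVINO IR. The sketch below is only an illustration (the ``model_dir`` path is a hypothetical placeholder), showing 4-bit symmetric compression roughly equivalent to passing ``--weight-format int4 --sym`` on the CLI; in this tutorial, however, compression is driven by ``optimum-cli`` as described in the next subsection.

.. code:: python

    from pathlib import Path

    import nncf
    import openvino as ov

    model_dir = Path("model")  # hypothetical directory with an exported IR
    core = ov.Core()
    model = core.read_model(model_dir / "openvino_model.xml")

    # 4-bit symmetric weight compression; group_size and ratio mirror the CLI arguments
    compressed_model = nncf.compress_weights(
        model,
        mode=nncf.CompressWeightsMode.INT4_SYM,
        group_size=128,
        ratio=0.8,
    )
    ov.save_model(compressed_model, model_dir / "openvino_model_int4.xml")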
- -Weights Compression using Optimum-CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -You can also apply fp16, 8-bit or 4-bit weight compression on the -Linear, Convolutional and Embedding layers when exporting your model -with the CLI by setting ``--weight-format`` to respectively fp16, int8 -or int4. This type of optimization allows to reduce the memory footprint -and inference latency. By default the quantization scheme for int8/int4 -will be -`asymmetric `__, -to make it -`symmetric `__ -you can add ``--sym``. - -For INT4 quantization you can also specify the following arguments : - -- The ``--group-size`` parameter will define the group size to use for - quantization, -1 it will results in per-column quantization. -- The ``--ratio`` parameter controls the ratio between 4-bit and 8-bit - quantization. If set to 0.9, it means that 90% of the layers will be - quantized to int4 while 10% will be quantized to int8. - -Smaller group_size and ratio values usually improve accuracy at the -sacrifice of the model size and inference latency. - - **Note**: There may be no speedup for INT4/INT8 compressed models on - dGPU. - -.. code:: ipython3 - - from IPython.display import Markdown, display - - prepare_int4_model = widgets.Checkbox( - value=True, - description="Prepare INT4 model", - disabled=False, - ) - prepare_int8_model = widgets.Checkbox( - value=False, - description="Prepare INT8 model", - disabled=False, - ) - prepare_fp16_model = widgets.Checkbox( - value=False, - description="Prepare FP16 model", - disabled=False, - ) - - display(prepare_int4_model) - display(prepare_int8_model) - display(prepare_fp16_model) - - - -.. parsed-literal:: - - Checkbox(value=True, description='Prepare INT4 model') - - - -.. parsed-literal:: - - Checkbox(value=False, description='Prepare INT8 model') - - - -.. parsed-literal:: - - Checkbox(value=False, description='Prepare FP16 model') - - -Weight compression with AWQ -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -`Activation-aware Weight -Quantization `__ (AWQ) is an algorithm -that tunes model weights for more accurate INT4 compression. It slightly -improves generation quality of compressed LLMs, but requires significant -additional time for tuning weights on a calibration dataset. We use -``wikitext-2-raw-v1/train`` subset of the -`Wikitext `__ -dataset for calibration. - -Below you can enable AWQ to be additionally applied during model export -with INT4 precision. - - **Note**: Applying AWQ requires significant memory and time. - -.. - - **Note**: It is possible that there will be no matching patterns in - the model to apply AWQ, in such case it will be skipped. - -.. code:: ipython3 - - enable_awq = widgets.Checkbox( - value=False, - description="Enable AWQ", - disabled=not prepare_int4_model.value, - ) - display(enable_awq) - - - -.. parsed-literal:: - - Checkbox(value=False, description='Enable AWQ') - - -We can now save floating point and compressed model variants - -.. 
code:: ipython3 - - from pathlib import Path - - pt_model_id = model_configuration["model_id"] - pt_model_name = model_id.value.split("-")[0] - fp16_model_dir = Path(model_id.value) / "FP16" - int8_model_dir = Path(model_id.value) / "INT8_compressed_weights" - int4_model_dir = Path(model_id.value) / "INT4_compressed_weights" - - - def convert_to_fp16(): - if (fp16_model_dir / "openvino_model.xml").exists(): - return - remote_code = model_configuration.get("remote_code", False) - export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format fp16".format(pt_model_id) - if remote_code: - export_command_base += " --trust-remote-code" - export_command = export_command_base + " " + str(fp16_model_dir) - display(Markdown("**Export command:**")) - display(Markdown(f"`{export_command}`")) - ! $export_command - - - def convert_to_int8(): - if (int8_model_dir / "openvino_model.xml").exists(): - return - int8_model_dir.mkdir(parents=True, exist_ok=True) - remote_code = model_configuration.get("remote_code", False) - export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int8".format(pt_model_id) - if remote_code: - export_command_base += " --trust-remote-code" - export_command = export_command_base + " " + str(int8_model_dir) - display(Markdown("**Export command:**")) - display(Markdown(f"`{export_command}`")) - ! $export_command - - - def convert_to_int4(): - compression_configs = { - "zephyr-7b-beta": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "mistral-7b": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "minicpm-2b-dpo": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "gemma-2b-it": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "notus-7b-v1": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "neural-chat-7b-v3-1": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "llama-2-chat-7b": { - "sym": True, - "group_size": 128, - "ratio": 0.8, - }, - "llama-3-8b-instruct": { - "sym": True, - "group_size": 128, - "ratio": 0.8, - }, - "llama-3.1-8b-instruct": { - "sym": True, - "group_size": 128, - "ratio": 1.0, - }, - "gemma-7b-it": { - "sym": True, - "group_size": 128, - "ratio": 0.8, - }, - "chatglm2-6b": { - "sym": True, - "group_size": 128, - "ratio": 0.72, - }, - "qwen-7b-chat": {"sym": True, "group_size": 128, "ratio": 0.6}, - "red-pajama-3b-chat": { - "sym": False, - "group_size": 128, - "ratio": 0.5, - }, - "qwen2.5-7b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "qwen2.5-3b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "qwen2.5-14b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "qwen2.5-1.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "default": { - "sym": False, - "group_size": 128, - "ratio": 0.8, - }, - } - - model_compression_params = compression_configs.get(model_id.value, compression_configs["default"]) - if (int4_model_dir / "openvino_model.xml").exists(): - return - remote_code = model_configuration.get("remote_code", False) - export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int4".format(pt_model_id) - int4_compression_args = " --group-size {} --ratio {}".format(model_compression_params["group_size"], model_compression_params["ratio"]) - if model_compression_params["sym"]: - int4_compression_args += " --sym" - if 
enable_awq.value: - int4_compression_args += " --awq --dataset wikitext2 --num-samples 128" - export_command_base += int4_compression_args - if remote_code: - export_command_base += " --trust-remote-code" - export_command = export_command_base + " " + str(int4_model_dir) - display(Markdown("**Export command:**")) - display(Markdown(f"`{export_command}`")) - ! $export_command - - - if prepare_fp16_model.value: - convert_to_fp16() - if prepare_int8_model.value: - convert_to_int8() - if prepare_int4_model.value: - convert_to_int4() - -Let’s compare model size for different compression types - -.. code:: ipython3 - - fp16_weights = fp16_model_dir / "openvino_model.bin" - int8_weights = int8_model_dir / "openvino_model.bin" - int4_weights = int4_model_dir / "openvino_model.bin" - - if fp16_weights.exists(): - print(f"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB") - for precision, compressed_weights in zip([8, 4], [int8_weights, int4_weights]): - if compressed_weights.exists(): - print(f"Size of model with INT{precision} compressed weights is {compressed_weights.stat().st_size / 1024 / 1024:.2f} MB") - if compressed_weights.exists() and fp16_weights.exists(): - print(f"Compression rate for INT{precision} model: {fp16_weights.stat().st_size / compressed_weights.stat().st_size:.3f}") - - -.. parsed-literal:: - - Size of model with INT4 compressed weights is 358.86 MB - - -Select device for inference and model variant ---------------------------------------------- - - - - **Note**: There may be no speedup for INT4/INT8 compressed models on - dGPU. - -.. code:: ipython3 - - import requests - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import device_widget - - device = device_widget("CPU", exclude=["NPU"]) - - device - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("llm-chatbot.ipynb") - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -The cell below demonstrates how to instantiate model based on selected -variant of model weights and inference device - -.. code:: ipython3 - - available_models = [] - if int4_model_dir.exists(): - available_models.append("INT4") - if int8_model_dir.exists(): - available_models.append("INT8") - if fp16_model_dir.exists(): - available_models.append("FP16") - - model_to_run = widgets.Dropdown( - options=available_models, - value=available_models[0], - description="Model to run:", - disabled=False, - ) - - model_to_run - - - - -.. parsed-literal:: - - Dropdown(description='Model to run:', options=('INT4',), value='INT4') - - - -Instantiate Model using Optimum Intel -------------------------------------- - - - -Optimum Intel can be used to load optimized models from the `Hugging -Face Hub `__ and -create pipelines to run an inference with OpenVINO Runtime using Hugging -Face APIs. The Optimum Inference models are API compatible with Hugging -Face Transformers models. This means we just need to replace -``AutoModelForXxx`` class with the corresponding ``OVModelForXxx`` -class. - -Below is an example of the RedPajama model - -.. 
code:: diff - - -from transformers import AutoModelForCausalLM - +from optimum.intel.openvino import OVModelForCausalLM - from transformers import AutoTokenizer, pipeline - - model_id = "togethercomputer/RedPajama-INCITE-Chat-3B-v1" - -model = AutoModelForCausalLM.from_pretrained(model_id) - +model = OVModelForCausalLM.from_pretrained(model_id, export=True) - -Model class initialization starts with calling ``from_pretrained`` -method. When downloading and converting Transformers model, the -parameter ``export=True`` should be added (as we already converted model -before, we do not need to provide this parameter). We can save the -converted model for the next usage with the ``save_pretrained`` method. -Tokenizer class and pipelines API are compatible with Optimum models. - -You can find more details about OpenVINO LLM inference using HuggingFace -Optimum API in `LLM inference -guide `__. - -.. code:: ipython3 - - from transformers import AutoConfig, AutoTokenizer - from optimum.intel.openvino import OVModelForCausalLM - - import openvino as ov - import openvino.properties as props - import openvino.properties.hint as hints - import openvino.properties.streams as streams - - - if model_to_run.value == "INT4": - model_dir = int4_model_dir - elif model_to_run.value == "INT8": - model_dir = int8_model_dir - else: - model_dir = fp16_model_dir - print(f"Loading model from {model_dir}") - - ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""} - - if "GPU" in device.value and "qwen2-7b-instruct" in model_id.value: - ov_config["GPU_ENABLE_SDPA_OPTIMIZATION"] = "NO" - - # On a GPU device a model is executed in FP16 precision. For red-pajama-3b-chat model there known accuracy - # issues caused by this, which we avoid by setting precision hint to "f32". - core = ov.Core() - - if model_id.value == "red-pajama-3b-chat" and "GPU" in core.available_devices and device.value in ["GPU", "AUTO"]: - ov_config["INFERENCE_PRECISION_HINT"] = "f32" - - model_name = model_configuration["model_id"] - tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) - - ov_model = OVModelForCausalLM.from_pretrained( - model_dir, - device=device.value, - ov_config=ov_config, - config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True), - trust_remote_code=True, - ) - - -.. parsed-literal:: - - Loading model from qwen2-0.5b-instruct/INT4_compressed_weights - - -.. parsed-literal:: - - Compiling the model to CPU ... - - -.. code:: ipython3 - - tokenizer_kwargs = model_configuration.get("tokenizer_kwargs", {}) - test_string = "2 + 2 =" - input_tokens = tok(test_string, return_tensors="pt", **tokenizer_kwargs) - answer = ov_model.generate(**input_tokens, max_new_tokens=2) - print(tok.batch_decode(answer, skip_special_tokens=True)[0]) - - -.. parsed-literal:: - - 2 + 2 = 4 - - -Run Chatbot ------------ - - - -Now, when model created, we can setup Chatbot interface using -`Gradio `__. The diagram below illustrates how -the chatbot pipeline works - -.. figure:: https://user-images.githubusercontent.com/29454499/255523209-d9336491-c7ba-4dc1-98f0-07f23743ce89.png - :alt: generation pipeline - - generation pipeline - -As can be seen, the pipeline very similar to instruction-following with -only changes that previous conversation history additionally passed as -input with next user question for getting wider input context. 
On the first iteration, the user-provided instructions are joined to the
conversation history (if it exists) and converted to token ids using a
tokenizer, then the prepared input is provided to the model. The model
generates probabilities for all tokens in logits format. The way the
next token is selected over the predicted probabilities is driven by the
selected decoding methodology. You can find more information about the
most popular decoding methods in this `blog `__. The generated result
updates the conversation history for the next conversation step, which
strengthens the connection between the next question and the previously
provided answers and allows the user to ask clarifying questions about
them. More details are available in the `LLM inference
guide <https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html>`__.

There are several parameters that can control text generation quality:

- | ``Temperature`` is a parameter used to control the level of
    creativity in AI-generated text. By adjusting the ``temperature``,
    you can influence the AI model's probability distribution, making
    the text more focused or diverse.
  | Consider the following example: The AI model has to complete the
    sentence "The cat is \____." with the following token probabilities:

  ::

      playing: 0.5
      sleeping: 0.25
      eating: 0.15
      driving: 0.05
      flying: 0.05

  - **Low temperature** (e.g., 0.2): The AI model becomes more focused
    and deterministic, choosing tokens with the highest probability,
    such as "playing."
  - **Medium temperature** (e.g., 1.0): The AI model maintains a balance
    between creativity and focus, selecting tokens based on their
    probabilities without significant bias, such as "playing,"
    "sleeping," or "eating."
  - **High temperature** (e.g., 2.0): The AI model becomes more
    adventurous, increasing the chances of selecting less likely tokens,
    such as "driving" and "flying."

- ``Top-p``, also known as nucleus sampling, is a parameter used to
  control the range of tokens considered by the AI model based on their
  cumulative probability. By adjusting the ``top-p`` value, you can
  influence the AI model's token selection, making it more focused or
  diverse. Using the same example with the cat, consider the following
  top_p settings:

  - **Low top_p** (e.g., 0.5): The AI model considers only tokens with
    the highest cumulative probability, such as "playing."
  - **Medium top_p** (e.g., 0.8): The AI model considers tokens with a
    higher cumulative probability, such as "playing," "sleeping," and
    "eating."
  - **High top_p** (e.g., 1.0): The AI model considers all tokens,
    including those with lower probabilities, such as "driving" and
    "flying."

- ``Top-k`` is another popular sampling strategy. In comparison with
  Top-P, which chooses from the smallest possible set of words whose
  cumulative probability exceeds the probability P, in Top-K sampling
  the K most likely next words are filtered and the probability mass is
  redistributed among only those K next words. In our example with the
  cat, if k=3, then only "playing", "sleeping" and "eating" will be
  taken into account as possible next words.

- ``Repetition Penalty`` penalizes tokens based on how frequently they
  occur in the text, including the input prompt. A token that has
  already appeared five times is penalized more heavily than a token
  that has appeared only one time. A value of 1 means that there is no
  penalty and values larger than 1 discourage repeated tokens (see the
  `LLM inference
  guide <https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html>`__).

..
code:: ipython3 - - import torch - from threading import Event, Thread - - from typing import List, Tuple - from transformers import ( - AutoTokenizer, - StoppingCriteria, - StoppingCriteriaList, - TextIteratorStreamer, - ) - - - model_name = model_configuration["model_id"] - start_message = model_configuration["start_message"] - history_template = model_configuration.get("history_template") - has_chat_template = model_configuration.get("has_chat_template", history_template is None) - current_message_template = model_configuration.get("current_message_template") - stop_tokens = model_configuration.get("stop_tokens") - tokenizer_kwargs = model_configuration.get("tokenizer_kwargs", {}) - - max_new_tokens = 256 - - - class StopOnTokens(StoppingCriteria): - def __init__(self, token_ids): - self.token_ids = token_ids - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: - for stop_id in self.token_ids: - if input_ids[0][-1] == stop_id: - return True - return False - - - if stop_tokens is not None: - if isinstance(stop_tokens[0], str): - stop_tokens = tok.convert_tokens_to_ids(stop_tokens) - - stop_tokens = [StopOnTokens(stop_tokens)] - - - def default_partial_text_processor(partial_text: str, new_text: str): - """ - helper for updating partially generated answer, used by default - - Params: - partial_text: text buffer for storing previosly generated text - new_text: text update for the current step - Returns: - updated text string - - """ - partial_text += new_text - return partial_text - - - text_processor = model_configuration.get("partial_text_processor", default_partial_text_processor) - - - def convert_history_to_token(history: List[Tuple[str, str]]): - """ - function for conversion history stored as list pairs of user and assistant messages to tokens according to model expected conversation template - Params: - history: dialogue history - Returns: - history in token format - """ - if pt_model_name == "baichuan2": - system_tokens = tok.encode(start_message) - history_tokens = [] - for old_query, response in history[:-1]: - round_tokens = [] - round_tokens.append(195) - round_tokens.extend(tok.encode(old_query)) - round_tokens.append(196) - round_tokens.extend(tok.encode(response)) - history_tokens = round_tokens + history_tokens - input_tokens = system_tokens + history_tokens - input_tokens.append(195) - input_tokens.extend(tok.encode(history[-1][0])) - input_tokens.append(196) - input_token = torch.LongTensor([input_tokens]) - elif history_template is None or has_chat_template: - messages = [{"role": "system", "content": start_message}] - for idx, (user_msg, model_msg) in enumerate(history): - if idx == len(history) - 1 and not model_msg: - messages.append({"role": "user", "content": user_msg}) - break - if user_msg: - messages.append({"role": "user", "content": user_msg}) - if model_msg: - messages.append({"role": "assistant", "content": model_msg}) - - input_token = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_tensors="pt") - else: - text = start_message + "".join( - ["".join([history_template.format(num=round, user=item[0], assistant=item[1])]) for round, item in enumerate(history[:-1])] - ) - text += "".join( - [ - "".join( - [ - current_message_template.format( - num=len(history) + 1, - user=history[-1][0], - assistant=history[-1][1], - ) - ] - ) - ] - ) - input_token = tok(text, return_tensors="pt", **tokenizer_kwargs).input_ids - return input_token - - - def bot(history, temperature, top_p, top_k, 
repetition_penalty, conversation_id):
        """
        callback function for running chatbot on submit button click

        Params:
          history: conversation history
          temperature: parameter that controls the level of creativity in AI-generated text.
                       By adjusting the `temperature`, you can influence the AI model's probability distribution, making the text more focused or diverse.
          top_p: parameter that controls the range of tokens considered by the AI model based on their cumulative probability.
          top_k: parameter that limits token selection to the k tokens with the highest probability.
          repetition_penalty: parameter that penalizes tokens based on how frequently they occur in the text.
          conversation_id: unique conversation identifier.

        """
        # Construct the input message string for the model by concatenating the current system message and conversation history
        # Tokenize the messages string
        input_ids = convert_history_to_token(history)
        if input_ids.shape[1] > 2000:
            history = [history[-1]]
            input_ids = convert_history_to_token(history)
        streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
        generate_kwargs = dict(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=temperature > 0.0,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            streamer=streamer,
        )
        if stop_tokens is not None:
            generate_kwargs["stopping_criteria"] = StoppingCriteriaList(stop_tokens)

        stream_complete = Event()

        def generate_and_signal_complete():
            """
            generation function for a single thread
            """
            global start_time
            ov_model.generate(**generate_kwargs)
            stream_complete.set()

        t1 = Thread(target=generate_and_signal_complete)
        t1.start()

        # Initialize an empty string to store the generated text
        partial_text = ""
        for new_text in streamer:
            partial_text = text_processor(partial_text, new_text)
            history[-1][1] = partial_text
            yield history


    def request_cancel():
        ov_model.request.cancel()

.. code:: ipython3

    if not Path("gradio_helper.py").exists():
        r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-chatbot/gradio_helper.py")
        open("gradio_helper.py", "w").write(r.text)

    from gradio_helper import make_demo

    demo = make_demo(run_fn=bot, stop_fn=request_cancel, title=f"OpenVINO {model_id.value} Chatbot", language=model_language.value)

    try:
        demo.launch()
    except Exception:
        demo.launch(share=True)
    # If you are launching remotely, specify server_name and server_port
    # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')`
    # To learn more please refer to the Gradio docs: https://gradio.app/docs/

.. code:: ipython3

    # please uncomment and run this cell to stop the gradio interface
    # demo.close()

Next Step
~~~~~~~~~

Besides the chatbot, we can use LangChain to augment LLM knowledge with
additional data, which allows you to build AI applications that can
reason about private data or data introduced after a model's cutoff
date. You can find this solution in the `Retrieval-augmented generation
(RAG) example <-with-output.html>`__.
diff --git a/docs/notebooks/llm-question-answering-with-output.rst b/docs/notebooks/llm-question-answering-with-output.rst deleted file mode 100644 index 8519ca1df866bb..00000000000000 --- a/docs/notebooks/llm-question-answering-with-output.rst +++ /dev/null @@ -1,877 +0,0 @@ -LLM Instruction-following pipeline with OpenVINO -================================================ - -LLM stands for “Large Language Model,” which refers to a type of -artificial intelligence model that is designed to understand and -generate human-like text based on the input it receives. LLMs are -trained on large datasets of text to learn patterns, grammar, and -semantic relationships, allowing them to generate coherent and -contextually relevant responses. One core capability of Large Language -Models (LLMs) is to follow natural language instructions. -Instruction-following models are capable of generating text in response -to prompts and are often used for tasks like writing assistance, -chatbots, and content generation. - -In this tutorial, we consider how to run an instruction-following text -generation pipeline using popular LLMs and OpenVINO. We will use -pre-trained models from the `Hugging Face -Transformers `__ -library. The `Hugging Face Optimum -Intel `__ library -converts the models to OpenVINO™ IR format. To simplify the user -experience, we will use `OpenVINO Generate -API `__ for -generation of instruction-following inference pipeline. - -The tutorial consists of the following steps: - -- Install prerequisites -- Download and convert the model from a public source using the - `OpenVINO integration with Hugging Face - Optimum `__. -- Compress model weights to INT8 and INT4 with `OpenVINO - NNCF `__ -- Create an instruction-following inference pipeline with `Generate - API `__ -- Run instruction-following pipeline - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Select model for inference <#select-model-for-inference>`__ -- `Download and convert model to OpenVINO IR via Optimum Intel - CLI <#download-and-convert-model-to-openvino-ir-via-optimum-intel-cli>`__ -- `Compress model weights <#compress-model-weights>`__ - - - `Weights Compression using Optimum Intel - CLI <#weights-compression-using-optimum-intel-cli>`__ - - `Weights Compression using - NNCF <#weights-compression-using-nncf>`__ - -- `Select device for inference and model - variant <#select-device-for-inference-and-model-variant>`__ -- `Create an instruction-following inference - pipeline <#create-an-instruction-following-inference-pipeline>`__ - - - `Setup imports <#setup-imports>`__ - - `Prepare text streamer to get results - runtime <#prepare-text-streamer-to-get-results-runtime>`__ - - `Main generation function <#main-generation-function>`__ - - `Helpers for application <#helpers-for-application>`__ - -- `Run instruction-following - pipeline <#run-instruction-following-pipeline>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. 
code:: ipython3 - - %pip uninstall -q -y optimum optimum-intel - %pip install -Uq "openvino>=2024.3.0" "openvino-genai" - %pip install -q "torch>=2.1" "nncf>=2.7" "transformers>=4.40.0" "huggingface-hub>=0.26.5" "onnx<1.16.2" "optimum>=1.16.1" "accelerate" "datasets>=2.14.6" "gradio>=4.19" "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu - -Select model for inference --------------------------- - - - -The tutorial supports different models, you can select one from the -provided options to compare the quality of open source LLM solutions. ->\ **Note**: conversion of some models can require additional actions -from user side and at least 64GB RAM for conversion. - -The available options are: - -- **tiny-llama-1b-chat** - This is the chat model finetuned on top of - `TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T `__. - The TinyLlama project aims to pretrain a 1.1B Llama model on 3 - trillion tokens with the adoption of the same architecture and - tokenizer as Llama 2. This means TinyLlama can be plugged and played - in many open-source projects built upon Llama. Besides, TinyLlama is - compact with only 1.1B parameters. This compactness allows it to - cater to a multitude of applications demanding a restricted - computation and memory footprint. More details about model can be - found in `model - card `__ -- **phi-2** - Phi-2 is a Transformer with 2.7 billion parameters. It - was trained using the same data sources as - `Phi-1.5 `__, augmented - with a new data source that consists of various NLP synthetic texts - and filtered websites (for safety and educational value). When - assessed against benchmarks testing common sense, language - understanding, and logical reasoning, Phi-2 showcased a nearly - state-of-the-art performance among models with less than 13 billion - parameters. More details about model can be found in `model - card `__. -- **dolly-v2-3b** - Dolly 2.0 is an instruction-following large - language model trained on the Databricks machine-learning platform - that is licensed for commercial use. It is based on - `Pythia `__ and is trained on - ~15k instruction/response fine-tuning records generated by Databricks - employees in various capability domains, including brainstorming, - classification, closed QA, generation, information extraction, open - QA, and summarization. Dolly 2.0 works by processing natural language - instructions and generating responses that follow the given - instructions. It can be used for a wide range of applications, - including closed question-answering, summarization, and generation. - More details about model can be found in `model - card `__. -- **red-pajama-3b-instruct** - A 2.8B parameter pre-trained language - model based on GPT-NEOX architecture. The model was fine-tuned for - few-shot applications on the data of - `GPT-JT `__, - with exclusion of tasks that overlap with the HELM core - scenarios.More details about model can be found in `model - card `__. -- **mistral-7b** - The Mistral-7B-v0.2 Large Language Model (LLM) is a - pretrained generative text model with 7 billion parameters. You can - find more details about model in the `model - card `__, - `paper `__ and `release blog - post `__. -- **llama-3-8b-instruct** - Llama 3 is an auto-regressive language - model that uses an optimized transformer architecture. 
The tuned - versions use supervised fine-tuning (SFT) and reinforcement learning - with human feedback (RLHF) to align with human preferences for - helpfulness and safety. The Llama 3 instruction tuned models are - optimized for dialogue use cases and outperform many of the available - open source chat models on common industry benchmarks. More details - about model can be found in `Meta blog - post `__, `model - website `__ and `model - card `__. - >\ **Note**: run model with demo, you will need to accept license - agreement. >You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, - carefully read terms of usage and click accept button. You will need - to use an access token for the code below to run. For more - information on access tokens, refer to `this section of the - documentation `__. - >You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: python - - ## login to huggingfacehub to get access to pretrained model - - from huggingface_hub import notebook_login, whoami - - try: - whoami() - print('Authorization token already provided') - except OSError: - notebook_login() - -.. code:: ipython3 - - from pathlib import Path - import requests - - if not Path("notebook_utils.py").exists(): - # Fetch `notebook_utils` module - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, device_widget - - if not Path("./config.py").exists(): - download_file(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-question-answering/config.py") - from config import SUPPORTED_LLM_MODELS - import ipywidgets as widgets - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("llm-question-answering.ipynb") - -.. code:: ipython3 - - model_ids = list(SUPPORTED_LLM_MODELS) - - model_id = widgets.Dropdown( - options=model_ids, - value=model_ids[1], - description="Model:", - disabled=False, - ) - - model_id - - - - -.. parsed-literal:: - - Dropdown(description='Model:', index=1, options=('tiny-llama-1b', 'phi-2', 'dolly-v2-3b', 'red-pajama-instruct… - - - -.. code:: ipython3 - - model_configuration = SUPPORTED_LLM_MODELS[model_id.value] - print(f"Selected model {model_id.value}") - - -.. parsed-literal:: - - Selected model dolly-v2-3b - - -Download and convert model to OpenVINO IR via Optimum Intel CLI ---------------------------------------------------------------- - - - -Listed model are available for downloading via the `HuggingFace -hub `__. We will use optimum-cli -interface for exporting it into OpenVINO Intermediate Representation -(IR) format. - -Optimum CLI interface for converting models supports export to OpenVINO -(supported starting optimum-intel 1.12 version). General command format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where ``--model`` argument is model id from HuggingFace Hub or local -directory with model (saved using ``.save_pretrained`` method), -``--task`` is one of `supported -task `__ -that exported model should solve. For LLMs it will be -``text-generation-with-past``. If model initialization requires to use -remote code, ``--trust-remote-code`` flag additionally should be passed. 
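For instance, with the ``dolly-v2-3b`` model selected above, a concrete export command could look as follows. The Hugging Face model id ``databricks/dolly-v2-3b`` and the output directory are shown here only for illustration; the notebook builds the exact command programmatically in the cells below.

.. code:: bash

   optimum-cli export openvino --model databricks/dolly-v2-3b --task text-generation-with-past --weight-format fp16 dolly-v2-3b/FP16
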
-Full list of supported arguments available via ``--help`` For more -details and examples of usage, please check `optimum -documentation `__. - -Compress model weights ----------------------- - - - -The Weights Compression algorithm is aimed at compressing the weights of -the models and can be used to optimize the model footprint and -performance of large models where the size of weights is relatively -larger than the size of activations, for example, Large Language Models -(LLM). Compared to INT8 compression, INT4 compression improves -performance even more but introduces a minor drop in prediction quality. - -Weights Compression using Optimum Intel CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Optimum Intel supports weight compression via NNCF out of the box. For -8-bit compression we pass ``--weight-format int8`` to ``optimum-cli`` -command line. For 4 bit compression we provide ``--weight-format int4`` -and some other options containing number of bits and other compression -parameters. An example of this approach usage you can find in -`llm-chatbot notebook `__ - -Weights Compression using NNCF -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -You also can perform weights compression for OpenVINO models using NNCF -directly. ``nncf.compress_weights`` function accepts the OpenVINO model -instance and compresses its weights for Linear and Embedding layers. We -will consider this variant in this notebook for both int4 and int8 -compression. - - **Note**: This tutorial involves conversion model for FP16 and - INT4/INT8 weights compression scenarios. It may be memory and - time-consuming in the first run. You can manually control the - compression precision below. **Note**: There may be no speedup for - INT4/INT8 compressed models on dGPU - -.. code:: ipython3 - - from IPython.display import display, Markdown - - prepare_int4_model = widgets.Checkbox( - value=True, - description="Prepare INT4 model", - disabled=False, - ) - prepare_int8_model = widgets.Checkbox( - value=False, - description="Prepare INT8 model", - disabled=False, - ) - prepare_fp16_model = widgets.Checkbox( - value=False, - description="Prepare FP16 model", - disabled=False, - ) - - display(prepare_int4_model) - display(prepare_int8_model) - display(prepare_fp16_model) - - - -.. parsed-literal:: - - Checkbox(value=True, description='Prepare INT4 model') - - - -.. parsed-literal:: - - Checkbox(value=False, description='Prepare INT8 model') - - - -.. parsed-literal:: - - Checkbox(value=False, description='Prepare FP16 model') - - -.. code:: ipython3 - - from pathlib import Path - import logging - import openvino as ov - import nncf - - nncf.set_log_level(logging.ERROR) - - pt_model_id = model_configuration["model_id"] - fp16_model_dir = Path(model_id.value) / "FP16" - int8_model_dir = Path(model_id.value) / "INT8_compressed_weights" - int4_model_dir = Path(model_id.value) / "INT4_compressed_weights" - - core = ov.Core() - - - def convert_to_fp16(): - if (fp16_model_dir / "openvino_model.xml").exists(): - return - export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format fp16".format(pt_model_id) - export_command = export_command_base + " " + str(fp16_model_dir) - display(Markdown("**Export command:**")) - display(Markdown(f"`{export_command}`")) - ! 
$export_command - - - def convert_to_int8(): - if (int8_model_dir / "openvino_model.xml").exists(): - return - int8_model_dir.mkdir(parents=True, exist_ok=True) - export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int8".format(pt_model_id) - export_command = export_command_base + " " + str(int8_model_dir) - display(Markdown("**Export command:**")) - display(Markdown(f"`{export_command}`")) - ! $export_command - - - def convert_to_int4(): - compression_configs = { - "mistral-7b": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "red-pajama-3b-instruct": { - "sym": False, - "group_size": 128, - "ratio": 0.5, - }, - "dolly-v2-3b": {"sym": False, "group_size": 32, "ratio": 0.5}, - "llama-3-8b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "default": { - "sym": False, - "group_size": 128, - "ratio": 0.8, - }, - } - - model_compression_params = compression_configs.get(model_id.value, compression_configs["default"]) - if (int4_model_dir / "openvino_model.xml").exists(): - return - export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int4".format(pt_model_id) - int4_compression_args = " --group-size {} --ratio {}".format(model_compression_params["group_size"], model_compression_params["ratio"]) - if model_compression_params["sym"]: - int4_compression_args += " --sym" - export_command_base += int4_compression_args - export_command = export_command_base + " " + str(int4_model_dir) - display(Markdown("**Export command:**")) - display(Markdown(f"`{export_command}`")) - ! $export_command - - - if prepare_fp16_model.value: - convert_to_fp16() - if prepare_int8_model.value: - convert_to_int8() - if prepare_int4_model.value: - convert_to_int4() - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino - - -Let’s compare model size for different compression types - -.. code:: ipython3 - - fp16_weights = fp16_model_dir / "openvino_model.bin" - int8_weights = int8_model_dir / "openvino_model.bin" - int4_weights = int4_model_dir / "openvino_model.bin" - - if fp16_weights.exists(): - print(f"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB") - for precision, compressed_weights in zip([8, 4], [int8_weights, int4_weights]): - if compressed_weights.exists(): - print(f"Size of model with INT{precision} compressed weights is {compressed_weights.stat().st_size / 1024 / 1024:.2f} MB") - if compressed_weights.exists() and fp16_weights.exists(): - print(f"Compression rate for INT{precision} model: {fp16_weights.stat().st_size / compressed_weights.stat().st_size:.3f}") - - -.. parsed-literal:: - - Size of FP16 model is 5297.21 MB - Size of model with INT8 compressed weights is 2656.29 MB - Compression rate for INT8 model: 1.994 - Size of model with INT4 compressed weights is 2154.54 MB - Compression rate for INT4 model: 2.459 - - -Select device for inference and model variant ---------------------------------------------- - - - - **Note**: There may be no speedup for INT4/INT8 compressed models on - dGPU. - -.. code:: ipython3 - - core = ov.Core() - - device = device_widget("CPU", exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -.. 
code:: ipython3

    available_models = []
    if int4_model_dir.exists():
        available_models.append("INT4")
    if int8_model_dir.exists():
        available_models.append("INT8")
    if fp16_model_dir.exists():
        available_models.append("FP16")

    model_to_run = widgets.Dropdown(
        options=available_models,
        value=available_models[0],
        description="Model to run:",
        disabled=False,
    )

    model_to_run




.. parsed-literal::

    Dropdown(description='Model to run:', options=('INT4', 'INT8', 'FP16'), value='INT4')



.. code:: ipython3

    from transformers import AutoTokenizer
    from openvino_tokenizers import convert_tokenizer

    if model_to_run.value == "INT4":
        model_dir = int4_model_dir
    elif model_to_run.value == "INT8":
        model_dir = int8_model_dir
    else:
        model_dir = fp16_model_dir
    print(f"Loading model from {model_dir}")

    # optionally convert the tokenizer if a cached model is used without it
    if not (model_dir / "openvino_tokenizer.xml").exists() or not (model_dir / "openvino_detokenizer.xml").exists():
        hf_tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
        ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)
        ov.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml")
        # save the detokenizer (not the tokenizer) to the detokenizer path
        ov.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml")


.. parsed-literal::

    Loading model from dolly-v2-3b/INT8_compressed_weights


Create an instruction-following inference pipeline
--------------------------------------------------



The ``run_generation`` function accepts user-provided text input,
tokenizes it, and runs the generation process. Text generation is an
iterative process, where each next token depends on the previously
generated ones, and it continues until the maximum number of tokens is
reached or a stop-generation condition is met.

The diagram below illustrates how the instruction-following pipeline
works.

.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/e881f4a4-fcc8-427a-afe1-7dd80aebd66e
   :alt: generation pipeline

   generation pipeline

As can be seen, on the first iteration the user provides instructions.
The instructions are converted to token ids using a tokenizer, and the
prepared input is provided to the model. The model generates
probabilities for all tokens in logits format. The way the next token is
selected over the predicted probabilities is driven by the selected
decoding methodology. You can find more information about the most
popular decoding methods in this `blog `__.

To simplify the user experience, we will use the `OpenVINO Generate
API `__.
First, we will create the pipeline with ``LLMPipeline``. ``LLMPipeline``
is the main object used for decoding. You can construct it straight away
from the folder with the converted model. It will automatically load the
``main model``, ``tokenizer``, ``detokenizer`` and default
``generation configuration``. After that, we will configure the
parameters for decoding. We can get the default config with
``get_generation_config()``, set up parameters and apply the updated
version with ``set_generation_config(config)``, or pass the config
directly to ``generate()``. It is also possible to specify the needed
options just as inputs in the ``generate()`` method, as shown below.
Then we just run the ``generate`` method and get the output in text
format. We do not need to encode the input prompt according to the
model's expected template or write post-processing code for the logits
decoder; this is all handled by ``LLMPipeline``.
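The snippet below is a minimal, self-contained sketch of this configuration flow. The model folder and parameter values are illustrative assumptions; in this notebook the pipeline is created from ``model_dir`` in the next cell.

.. code:: python

    import openvino_genai as ov_genai

    # Illustrative values: an already converted model folder and the CPU device are assumed.
    pipe = ov_genai.LLMPipeline("dolly-v2-3b/INT4_compressed_weights", "CPU")

    # Start from the pipeline defaults, adjust a few decoding parameters,
    # and make the updated configuration the new default for generate().
    config = pipe.get_generation_config()
    config.max_new_tokens = 128
    config.do_sample = True
    config.temperature = 0.8
    config.top_k = 50
    pipe.set_generation_config(config)

    # generate() now uses the updated defaults; individual options can still be
    # overridden per call, as done in the next cell.
    print(pipe.generate("What is OpenVINO?"))
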
- -To obtain intermediate generation results without waiting until when -generation is finished, we will write class-iterator based on -``StreamerBase`` class of ``openvino_genai``. - -.. code:: ipython3 - - import openvino_genai as ov_genai - - pipe = ov_genai.LLMPipeline(model_dir.as_posix(), device.value) - print(pipe.generate("The Sun is yellow bacause", temperature=1.2, top_k=4, do_sample=True, max_new_tokens=150)) - - -.. parsed-literal:: - - of the presence of chlorophyll - in its leaves. Chlorophyll absorbs all - visible sunlight and this causes it to - turn from a green to yellow colour. - The Sun is yellow bacause of the presence of chlorophyll in its leaves. Chlorophyll absorbs all - visible sunlight and this causes it to - turn from a green to yellow colour. - The yellow colour of the Sun is the - colour we perceive as the colour of the - sun. It also causes us to perceive the - sun as yellow. This property is called - the yellow colouration of the Sun and it - is caused by the presence of chlorophyll - in the leaves of plants. - Chlorophyll is also responsible for the green colour of plants - - -There are several parameters that can control text generation quality: - -- | ``Temperature`` is a parameter used to control the level of - creativity in AI-generated text. By adjusting the ``temperature``, - you can influence the AI model’s probability distribution, making - the text more focused or diverse. - | Consider the following example: The AI model has to complete the - sentence “The cat is \____.” with the following token - probabilities: - - | playing: 0.5 - | sleeping: 0.25 - | eating: 0.15 - | driving: 0.05 - | flying: 0.05 - - - **Low temperature** (e.g., 0.2): The AI model becomes more focused - and deterministic, choosing tokens with the highest probability, - such as “playing.” - - **Medium temperature** (e.g., 1.0): The AI model maintains a - balance between creativity and focus, selecting tokens based on - their probabilities without significant bias, such as “playing,” - “sleeping,” or “eating.” - - **High temperature** (e.g., 2.0): The AI model becomes more - adventurous, increasing the chances of selecting less likely - tokens, such as “driving” and “flying.” - -- ``Top-p``, also known as nucleus sampling, is a parameter used to - control the range of tokens considered by the AI model based on their - cumulative probability. By adjusting the ``top-p`` value, you can - influence the AI model’s token selection, making it more focused or - diverse. Using the same example with the cat, consider the following - top_p settings: - - - **Low top_p** (e.g., 0.5): The AI model considers only tokens with - the highest cumulative probability, such as “playing.” - - **Medium top_p** (e.g., 0.8): The AI model considers tokens with a - higher cumulative probability, such as “playing,” “sleeping,” and - “eating.” - - **High top_p** (e.g., 1.0): The AI model considers all tokens, - including those with lower probabilities, such as “driving” and - “flying.” - -- ``Top-k`` is another popular sampling strategy. In comparison with - Top-P, which chooses from the smallest possible set of words whose - cumulative probability exceeds the probability P, in Top-K sampling K - most likely next words are filtered and the probability mass is - redistributed among only those K next words. In our example with cat, - if k=3, then only “playing”, “sleeping” and “eating” will be taken - into account as possible next word. 
- -The generation cycle repeats until the end of the sequence token is -reached or it also can be interrupted when maximum tokens will be -generated. As already mentioned before, we can enable printing current -generated tokens without waiting until when the whole generation is -finished using Streaming API, it adds a new token to the output queue -and then prints them when they are ready. - -Setup imports -~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from threading import Thread - from time import perf_counter - from typing import List - import numpy as np - from queue import Queue - import re - -Prepare text streamer to get results runtime -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Load the ``detokenizer``, use it to convert token_id to string output -format. We will collect print-ready text in a queue and give the text -when it is needed. It will help estimate performance. - -.. code:: ipython3 - - core = ov.Core() - - detokinizer_dir = Path(model_dir, "openvino_detokenizer.xml") - - - class TextIteratorStreamer(ov_genai.StreamerBase): - def __init__(self, tokenizer): - super().__init__() - self.tokenizer = tokenizer - self.compiled_detokenizer = core.compile_model(detokinizer_dir.as_posix()) - self.text_queue = Queue() - self.stop_signal = None - - def __iter__(self): - return self - - def __next__(self): - value = self.text_queue.get() - if value == self.stop_signal: - raise StopIteration() - else: - return value - - def put(self, token_id): - openvino_output = self.compiled_detokenizer(np.array([[token_id]], dtype=int)) - text = str(openvino_output["string_output"][0]) - # remove labels/special symbols - text = text.lstrip("!") - text = re.sub("<.*>", "", text) - self.text_queue.put(text) - - def end(self): - self.text_queue.put(self.stop_signal) - -Main generation function -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -As it was discussed above, ``run_generation`` function is the entry -point for starting generation. It gets provided input instruction as -parameter and returns model response. - -.. code:: ipython3 - - def run_generation( - user_text: str, - top_p: float, - temperature: float, - top_k: int, - max_new_tokens: int, - perf_text: str, - ): - """ - Text generation function - - Parameters: - user_text (str): User-provided instruction for a generation. - top_p (float): Nucleus sampling. If set to < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for a generation. - temperature (float): The value used to module the logits distribution. - top_k (int): The number of highest probability vocabulary tokens to keep for top-k-filtering. - max_new_tokens (int): Maximum length of generated sequence. - perf_text (str): Content of text field for printing performance results. - Returns: - model_output (str) - model-generated text - perf_text (str) - updated perf text filed content - """ - - # setup config for decoding stage - config = pipe.get_generation_config() - config.temperature = temperature - if top_k > 0: - config.top_k = top_k - config.top_p = top_p - config.do_sample = True - config.max_new_tokens = max_new_tokens - - # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer - # in the main thread. 
- streamer = TextIteratorStreamer(pipe.get_tokenizer()) - t = Thread(target=pipe.generate, args=(user_text, config, streamer)) - t.start() - - model_output = "" - per_token_time = [] - num_tokens = 0 - start = perf_counter() - for new_text in streamer: - current_time = perf_counter() - start - model_output += new_text - perf_text, num_tokens = estimate_latency(current_time, perf_text, per_token_time, num_tokens) - yield model_output, perf_text - start = perf_counter() - return model_output, perf_text - -Helpers for application -~~~~~~~~~~~~~~~~~~~~~~~ - - - -For making interactive user interface we will use Gradio library. The -code bellow provides useful functions used for communication with UI -elements. - -.. code:: ipython3 - - def estimate_latency( - current_time: float, - current_perf_text: str, - per_token_time: List[float], - num_tokens: int, - ): - """ - Helper function for performance estimation - - Parameters: - current_time (float): This step time in seconds. - current_perf_text (str): Current content of performance UI field. - per_token_time (List[float]): history of performance from previous steps. - num_tokens (int): Total number of generated tokens. - - Returns: - update for performance text field - update for a total number of tokens - """ - num_tokens += 1 - per_token_time.append(1 / current_time) - if len(per_token_time) > 10 and len(per_token_time) % 4 == 0: - current_bucket = per_token_time[:-10] - return ( - f"Average generation speed: {np.mean(current_bucket):.2f} tokens/s. Total generated tokens: {num_tokens}", - num_tokens, - ) - return current_perf_text, num_tokens - -Run instruction-following pipeline ----------------------------------- - - - -Now, we are ready to explore model capabilities. This demo provides a -simple interface that allows communication with a model using text -instruction. Type your instruction into the ``User instruction`` field -or select one from predefined examples and click on the ``Submit`` -button to start generation. Additionally, you can modify advanced -generation parameters: - -- ``Device`` - allows switching inference device. Please note, every - time when new device is selected, model will be recompiled and this - takes some time. -- ``Max New Tokens`` - maximum size of generated text. -- ``Top-p (nucleus sampling)`` - if set to < 1, only the smallest set - of most probable tokens with probabilities that add up to top_p or - higher are kept for a generation. -- ``Top-k`` - the number of highest probability vocabulary tokens to - keep for top-k-filtering. -- ``Temperature`` - the value used to module the logits distribution. - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-question-answering/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(run_fn=run_generation, title=f"Question Answering with {model_id.value} and OpenVINO") - - try: - demo.queue().launch(height=800) - except Exception: - demo.queue().launch(share=True, height=800) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ - -.. 
code:: ipython3 - - # please uncomment and run this cell for stopping gradio interface - # demo.close() diff --git a/docs/notebooks/llm-rag-langchain-genai-with-output.rst b/docs/notebooks/llm-rag-langchain-genai-with-output.rst deleted file mode 100644 index 947357064f46af..00000000000000 --- a/docs/notebooks/llm-rag-langchain-genai-with-output.rst +++ /dev/null @@ -1,1374 +0,0 @@ -Create a RAG system using OpenVINO GenAI and LangChain -====================================================== - -**Retrieval-augmented generation (RAG)** is a technique for augmenting -LLM knowledge with additional, often private or real-time, data. LLMs -can reason about wide-ranging topics, but their knowledge is limited to -the public data up to a specific point in time that they were trained -on. If you want to build AI applications that can reason about private -data or data introduced after a model’s cutoff date, you need to augment -the knowledge of the model with the specific information it needs. The -process of bringing the appropriate information and inserting it into -the model prompt is known as Retrieval Augmented Generation (RAG). - -`LangChain `__ -is a framework for developing applications powered by language models. -It has a number of components specifically designed to help build RAG -applications. In this tutorial, we’ll build a simple question-answering -application over a text data source. - -The tutorial consists of the following steps: - -- Install prerequisites -- Download and convert the model from a public source using the - `OpenVINO integration with Hugging Face - Optimum `__. -- Compress model weights to 4-bit or 8-bit data types using - `NNCF `__ -- Create a RAG chain pipeline with `OpenVINO - GenAI `__ -- Run Q&A pipeline - -In this example, the customized RAG pipeline consists of following -components in order, where embedding, rerank and LLM will be deployed -with `OpenVINO -GenAI `__ to optimize -their inference performance. - -.. 
figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/91237924/0076f6c7-75e4-4c2e-9015-87b355e5ca28 - :alt: RAG - - RAG - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Select model for inference <#select-model-for-inference>`__ -- `login to huggingfacehub to get access to pretrained - model <#login-to-huggingfacehub-to-get-access-to-pretrained-model>`__ -- `Convert model and compress model - weights <#convert-model-and-compress-model-weights>`__ - - - `LLM conversion and Weights Compression using - Optimum-CLI <#llm-conversion-and-weights-compression-using-optimum-cli>`__ - - - `Weight compression with AWQ <#weight-compression-with-awq>`__ - - - `Convert embedding model using - Optimum-CLI <#convert-embedding-model-using-optimum-cli>`__ - - `Convert rerank model using - Optimum-CLI <#convert-rerank-model-using-optimum-cli>`__ - -- `Select device for inference and model - variant <#select-device-for-inference-and-model-variant>`__ - - - `Select device for embedding model - inference <#select-device-for-embedding-model-inference>`__ - - `Select device for rerank model - inference <#select-device-for-rerank-model-inference>`__ - - `Select device for LLM model - inference <#select-device-for-llm-model-inference>`__ - -- `Load model <#load-model>`__ - - - `Load embedding model <#load-embedding-model>`__ - - `Load rerank model <#load-rerank-model>`__ - - `Load LLM model <#load-llm-model>`__ - -- `Run QA over Document <#run-qa-over-document>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install required dependencies - -.. 
code:: ipython3 - - import os - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - with open("notebook_utils.py", "w") as f: - f.write(r.text) - - if not Path("pip_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py", - ) - open("pip_helper.py", "w").write(r.text) - - from pip_helper import pip_install - - os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - - pip_install("--pre", "-U", "openvino>=2024.2.0", "--extra-index-url", "https://storage.openvinotoolkit.org/simple/wheels/nightly") - pip_install("--pre", "-U", "openvino-tokenizers[transformers]", "--extra-index-url", "https://storage.openvinotoolkit.org/simple/wheels/nightly") - pip_install("--pre", "-U", "openvino_genai", "--extra-index-url", "https://storage.openvinotoolkit.org/simple/wheels/nightly") - pip_install( - "-q", - "--extra-index-url", - "https://download.pytorch.org/whl/cpu", - "git+https://github.com/huggingface/optimum-intel.git", - "git+https://github.com/openvinotoolkit/nncf.git", - "datasets", - "accelerate", - "gradio>=4.19", - "onnx<1.16.2", - "einops", - "transformers_stream_generator", - "tiktoken", - "transformers>=4.43.1", - "faiss-cpu", - "sentence_transformers", - "langchain>=0.2.0", - "langchain-community>=0.2.15", - "langchainhub", - "unstructured", - "scikit-learn", - "python-docx", - "pypdf", - "huggingface-hub>=0.26.5", - ) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("llm-rag-langchain-genai.ipynb") - -.. 
code:: ipython3 - - import os - from pathlib import Path - import requests - import shutil - import io - - # fetch model configuration - - config_shared_path = Path("../../utils/llm_config.py") - config_dst_path = Path("llm_config.py") - text_example_en_path = Path("text_example_en.pdf") - text_example_cn_path = Path("text_example_cn.pdf") - text_example_en = "https://github.com/openvinotoolkit/openvino_notebooks/files/15039728/Platform.Brief_Intel.vPro.with.Intel.Core.Ultra_Final.pdf" - text_example_cn = "https://github.com/openvinotoolkit/openvino_notebooks/files/15039713/Platform.Brief_Intel.vPro.with.Intel.Core.Ultra_Final_CH.pdf" - - if not config_dst_path.exists(): - if config_shared_path.exists(): - try: - os.symlink(config_shared_path, config_dst_path) - except Exception: - shutil.copy(config_shared_path, config_dst_path) - else: - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py") - with open("llm_config.py", "w", encoding="utf-8") as f: - f.write(r.text) - elif not os.path.islink(config_dst_path): - print("LLM config will be updated") - if config_shared_path.exists(): - shutil.copy(config_shared_path, config_dst_path) - else: - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py") - with open("llm_config.py", "w", encoding="utf-8") as f: - f.write(r.text) - - if not text_example_en_path.exists(): - r = requests.get(url=text_example_en) - content = io.BytesIO(r.content) - with open("text_example_en.pdf", "wb") as f: - f.write(content.read()) - - if not text_example_cn_path.exists(): - r = requests.get(url=text_example_cn) - content = io.BytesIO(r.content) - with open("text_example_cn.pdf", "wb") as f: - f.write(content.read()) - -Select model for inference --------------------------- - - - -The tutorial supports different models, you can select one from the -provided options to compare the quality of open source LLM solutions. - - **Note**: conversion of some models can require additional actions - from user side and at least 64GB RAM for conversion. - -The available embedding model options are: - -- `bge-small-en-v1.5 `__ -- `bge-small-zh-v1.5 `__ -- `bge-large-en-v1.5 `__ -- `bge-large-zh-v1.5 `__ -- `bge-m3 `__ - -BGE embedding is a general Embedding Model. The model is pre-trained -using RetroMAE and trained on large-scale pair data using contrastive -learning. - -The available rerank model options are: - -- `bge-reranker-v2-m3 `__ -- `bge-reranker-large `__ -- `bge-reranker-base `__ - -Reranker model with cross-encoder will perform full-attention over the -input pair, which is more accurate than embedding model (i.e., -bi-encoder) but more time-consuming than embedding model. Therefore, it -can be used to re-rank the top-k documents returned by embedding model. - -You can also find available LLM model options in -`llm-chatbot `__ notebook. - -.. code:: ipython3 - - from pathlib import Path - import ipywidgets as widgets - -Convert model and compress model weights ----------------------------------------- - - - -The Weights Compression algorithm is aimed at compressing the weights of -the models and can be used to optimize the model footprint and -performance of large models where the size of weights is relatively -larger than the size of activations, for example, Large Language Models -(LLM). Compared to INT8 compression, INT4 compression improves -performance even more, but introduces a minor drop in prediction -quality. - -.. 
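-
-The notebook applies this compression through ``optimum-cli`` in the cells
-below. As a rough illustration of what that does under the hood, the same
-weight compression can also be applied to an already exported OpenVINO IR
-directly with NNCF (a minimal sketch only; the model paths are placeholders):
-
-.. code:: python
-
-    # Sketch: INT4 weight compression of an exported OpenVINO IR with NNCF.
-    # The paths below are placeholders for an already exported FP16 model.
-    import nncf
-    import openvino as ov
-
-    core = ov.Core()
-    ov_model = core.read_model("model_fp16/openvino_model.xml")
-
-    compressed_model = nncf.compress_weights(
-        ov_model,
-        mode=nncf.CompressWeightsMode.INT4_SYM,  # or INT8_ASYM for 8-bit weights
-        ratio=0.8,       # share of weights compressed to INT4, the rest stay INT8
-        group_size=128,  # group size used for INT4 quantization
-    )
-    ov.save_model(compressed_model, "model_int4/openvino_model.xml")
-
-.. 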
code:: ipython3 - - from llm_config import ( - SUPPORTED_EMBEDDING_MODELS, - SUPPORTED_RERANK_MODELS, - SUPPORTED_LLM_MODELS, - ) - - model_languages = list(SUPPORTED_LLM_MODELS) - - model_language = widgets.Dropdown( - options=model_languages, - value=model_languages[0], - description="Model Language:", - disabled=False, - ) - - model_language - - - - -.. parsed-literal:: - - Dropdown(description='Model Language:', options=('English', 'Chinese', 'Japanese'), value='English') - - - -.. code:: ipython3 - - llm_model_ids = [model_id for model_id, model_config in SUPPORTED_LLM_MODELS[model_language.value].items() if model_config.get("rag_prompt_template")] - - llm_model_id = widgets.Dropdown( - options=llm_model_ids, - value=llm_model_ids[-1], - description="Model:", - disabled=False, - ) - - llm_model_id - - - - -.. parsed-literal:: - - Dropdown(description='Model:', index=19, options=('tiny-llama-1b-chat', 'llama-3.2-1b-instruct', 'llama-3.2-3b… - - - -.. code:: ipython3 - - llm_model_configuration = SUPPORTED_LLM_MODELS[model_language.value][llm_model_id.value] - print(f"Selected LLM model {llm_model_id.value}") - - -.. parsed-literal:: - - Selected LLM model phi-3-mini-instruct - - -`Optimum Intel `__ is -the interface between the -`Transformers `__ and -`Diffusers `__ libraries -and OpenVINO to accelerate end-to-end pipelines on Intel architectures. -It provides ease-to-use cli interface for exporting models to `OpenVINO -Intermediate Representation -(IR) `__ -format. - -The command bellow demonstrates basic command for model export with -``optimum-cli`` - -:: - - optimum-cli export openvino --model --task - -where ``--model`` argument is model id from HuggingFace Hub or local -directory with model (saved using ``.save_pretrained`` method), -``--task`` is one of `supported -task `__ -that exported model should solve. For LLMs it will be -``text-generation-with-past``. If model initialization requires to use -remote code, ``--trust-remote-code`` flag additionally should be passed. - -LLM conversion and Weights Compression using Optimum-CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -You can also apply fp16, 8-bit or 4-bit weight compression on the -Linear, Convolutional and Embedding layers when exporting your model -with the CLI by setting ``--weight-format`` to respectively fp16, int8 -or int4. This type of optimization allows to reduce the memory footprint -and inference latency. By default the quantization scheme for int8/int4 -will be -`asymmetric `__, -to make it -`symmetric `__ -you can add ``--sym``. - -For INT4 quantization you can also specify the following arguments : - -- The ``--group-size`` parameter will define the group size to use for - quantization, -1 it will results in per-column quantization. -- The ``--ratio`` parameter controls the ratio between 4-bit and 8-bit - quantization. If set to 0.9, it means that 90% of the layers will be - quantized to int4 while 10% will be quantized to int8. - -Smaller group_size and ratio values usually improve accuracy at the -sacrifice of the model size and inference latency. - - **Note**: There may be no speedup for INT4/INT8 compressed models on - dGPU. - -.. 
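-
-For example, a complete INT4 export command with these options filled in
-could look like the one assembled below (a sketch only; the model id and
-output directory are placeholders, and the values actually used by this
-notebook come from ``compression_configs`` further down):
-
-.. code:: python
-
-    # Hypothetical, fully assembled INT4 export command.
-    model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"             # placeholder model id
-    output_dir = "tiny-llama-1b-chat/INT4_compressed_weights"   # placeholder path
-
-    export_command = (
-        f"optimum-cli export openvino --model {model_id} "
-        "--task text-generation-with-past "
-        "--weight-format int4 --group-size 128 --ratio 0.8 --sym "
-        f"{output_dir}"
-    )
-    print(export_command)
-
-.. 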
code:: ipython3 - - from IPython.display import Markdown, display - - prepare_int4_model = widgets.Checkbox( - value=True, - description="Prepare INT4 model", - disabled=False, - ) - prepare_int8_model = widgets.Checkbox( - value=False, - description="Prepare INT8 model", - disabled=False, - ) - prepare_fp16_model = widgets.Checkbox( - value=False, - description="Prepare FP16 model", - disabled=False, - ) - - display(prepare_int4_model) - display(prepare_int8_model) - display(prepare_fp16_model) - - - -.. parsed-literal:: - - Checkbox(value=True, description='Prepare INT4 model') - - - -.. parsed-literal:: - - Checkbox(value=False, description='Prepare INT8 model') - - - -.. parsed-literal:: - - Checkbox(value=False, description='Prepare FP16 model') - - -Weight compression with AWQ -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -`Activation-aware Weight -Quantization `__ (AWQ) is an algorithm -that tunes model weights for more accurate INT4 compression. It slightly -improves generation quality of compressed LLMs, but requires significant -additional time for tuning weights on a calibration dataset. We use -``wikitext-2-raw-v1/train`` subset of the -`Wikitext `__ -dataset for calibration. - -Below you can enable AWQ to be additionally applied during model export -with INT4 precision. - - **Note**: Applying AWQ requires significant memory and time. - -.. - - **Note**: It is possible that there will be no matching patterns in - the model to apply AWQ, in such case it will be skipped. - -.. code:: ipython3 - - enable_awq = widgets.Checkbox( - value=False, - description="Enable AWQ", - disabled=not prepare_int4_model.value, - ) - display(enable_awq) - - - -.. parsed-literal:: - - Checkbox(value=False, description='Enable AWQ') - - -.. code:: ipython3 - - pt_model_id = llm_model_configuration["model_id"] - pt_model_name = llm_model_id.value.split("-")[0] - fp16_model_dir = Path(llm_model_id.value) / "FP16" - int8_model_dir = Path(llm_model_id.value) / "INT8_compressed_weights" - int4_model_dir = Path(llm_model_id.value) / "INT4_compressed_weights" - - - def convert_to_fp16(): - if (fp16_model_dir / "openvino_model.xml").exists(): - return - remote_code = llm_model_configuration.get("remote_code", False) - export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format fp16".format(pt_model_id) - if remote_code: - export_command_base += " --trust-remote-code" - export_command = export_command_base + " " + str(fp16_model_dir) - display(Markdown("**Export command:**")) - display(Markdown(f"`{export_command}`")) - ! $export_command - - - def convert_to_int8(): - if (int8_model_dir / "openvino_model.xml").exists(): - return - int8_model_dir.mkdir(parents=True, exist_ok=True) - remote_code = llm_model_configuration.get("remote_code", False) - export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int8".format(pt_model_id) - if remote_code: - export_command_base += " --trust-remote-code" - export_command = export_command_base + " " + str(int8_model_dir) - display(Markdown("**Export command:**")) - display(Markdown(f"`{export_command}`")) - ! 
$export_command - - - def convert_to_int4(): - compression_configs = { - "zephyr-7b-beta": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "mistral-7b": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "minicpm-2b-dpo": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "gemma-2b-it": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "notus-7b-v1": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "neural-chat-7b-v3-1": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "llama-2-chat-7b": { - "sym": True, - "group_size": 128, - "ratio": 0.8, - }, - "llama-3-8b-instruct": { - "sym": True, - "group_size": 128, - "ratio": 0.8, - }, - "gemma-7b-it": { - "sym": True, - "group_size": 128, - "ratio": 0.8, - }, - "chatglm2-6b": { - "sym": True, - "group_size": 128, - "ratio": 0.72, - }, - "qwen-7b-chat": {"sym": True, "group_size": 128, "ratio": 0.6}, - "red-pajama-3b-chat": { - "sym": False, - "group_size": 128, - "ratio": 0.5, - }, - "qwen2.5-7b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "qwen2.5-3b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "qwen2.5-14b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "qwen2.5-1.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "default": { - "sym": False, - "group_size": 128, - "ratio": 0.8, - }, - } - - model_compression_params = compression_configs.get(llm_model_id.value, compression_configs["default"]) - if (int4_model_dir / "openvino_model.xml").exists(): - return - remote_code = llm_model_configuration.get("remote_code", False) - export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int4".format(pt_model_id) - int4_compression_args = " --group-size {} --ratio {}".format(model_compression_params["group_size"], model_compression_params["ratio"]) - if model_compression_params["sym"]: - int4_compression_args += " --sym" - if enable_awq.value: - int4_compression_args += " --awq --dataset wikitext2 --num-samples 128" - export_command_base += int4_compression_args - if remote_code: - export_command_base += " --trust-remote-code" - export_command = export_command_base + " " + str(int4_model_dir) - display(Markdown("**Export command:**")) - display(Markdown(f"`{export_command}`")) - ! $export_command - - - if prepare_fp16_model.value: - convert_to_fp16() - if prepare_int8_model.value: - convert_to_int8() - if prepare_int4_model.value: - convert_to_int4() - -Let’s compare model size for different compression types - -.. code:: ipython3 - - fp16_weights = fp16_model_dir / "openvino_model.bin" - int8_weights = int8_model_dir / "openvino_model.bin" - int4_weights = int4_model_dir / "openvino_model.bin" - - if fp16_weights.exists(): - print(f"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB") - for precision, compressed_weights in zip([8, 4], [int8_weights, int4_weights]): - if compressed_weights.exists(): - print(f"Size of model with INT{precision} compressed weights is {compressed_weights.stat().st_size / 1024 / 1024:.2f} MB") - if compressed_weights.exists() and fp16_weights.exists(): - print(f"Compression rate for INT{precision} model: {fp16_weights.stat().st_size / compressed_weights.stat().st_size:.3f}") - - -.. 
parsed-literal:: - - Size of model with INT4 compressed weights is 2319.41 MB - - -Convert embedding model using Optimum-CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Since some embedding models can only support limited languages, we can -filter them out according the LLM you selected. - -.. code:: ipython3 - - embedding_model_id = list(SUPPORTED_EMBEDDING_MODELS[model_language.value]) - - embedding_model_id = widgets.Dropdown( - options=embedding_model_id, - value=embedding_model_id[0], - description="Embedding Model:", - disabled=False, - ) - - embedding_model_id - - - - -.. parsed-literal:: - - Dropdown(description='Embedding Model:', options=('bge-small-en-v1.5', 'bge-large-en-v1.5', 'bge-m3'), value='… - - - -.. code:: ipython3 - - embedding_model_configuration = SUPPORTED_EMBEDDING_MODELS[model_language.value][embedding_model_id.value] - print(f"Selected {embedding_model_id.value} model") - - -.. parsed-literal:: - - Selected bge-small-en-v1.5 model - - -OpenVINO embedding model and tokenizer can be exported by -``feature-extraction`` task with ``optimum-cli``. - -.. code:: ipython3 - - export_command_base = "optimum-cli export openvino --model {} --task feature-extraction".format(embedding_model_configuration["model_id"]) - export_command = export_command_base + " " + str(embedding_model_id.value) - - if not Path(embedding_model_id.value).exists(): - ! $export_command - -Convert rerank model using Optimum-CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - rerank_model_id = list(SUPPORTED_RERANK_MODELS) - - rerank_model_id = widgets.Dropdown( - options=rerank_model_id, - value=rerank_model_id[0], - description="Rerank Model:", - disabled=False, - ) - - rerank_model_id - - - - -.. parsed-literal:: - - Dropdown(description='Rerank Model:', options=('bge-reranker-v2-m3', 'bge-reranker-large', 'bge-reranker-base'… - - - -.. code:: ipython3 - - rerank_model_configuration = SUPPORTED_RERANK_MODELS[rerank_model_id.value] - print(f"Selected {rerank_model_id.value} model") - - -.. parsed-literal:: - - Selected bge-reranker-v2-m3 model - - -Since ``rerank`` model is sort of sentence classification task, its -OpenVINO IR and tokenizer can be exported by ``text-classification`` -task with ``optimum-cli``. - -.. code:: ipython3 - - export_command_base = "optimum-cli export openvino --model {} --task text-classification".format(rerank_model_configuration["model_id"]) - export_command = export_command_base + " " + str(rerank_model_id.value) - - if not Path(rerank_model_id.value).exists(): - ! $export_command - -Select device for inference and model variant ---------------------------------------------- - - - - **Note**: There may be no speedup for INT4/INT8 compressed models on - dGPU. - -Select device for embedding model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - embedding_device = device_widget() - - embedding_device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - print(f"Embedding model will be loaded to {embedding_device.value} device for text embedding") - - -.. parsed-literal:: - - Embedding model will be loaded to CPU device for text embedding - - -Optimize the BGE embedding model’s parameter precision when loading -model to NPU device. - -.. 
code:: ipython3 - - from notebook_utils import optimize_bge_embedding - - USING_NPU = embedding_device.value == "NPU" - - npu_embedding_dir = embedding_model_id.value + "-npu" - npu_embedding_path = Path(npu_embedding_dir) / "openvino_model.xml" - if USING_NPU and not Path(npu_embedding_dir).exists(): - shutil.copytree(embedding_model_id.value, npu_embedding_dir) - optimize_bge_embedding(Path(embedding_model_id.value) / "openvino_model.xml", npu_embedding_path) - -Select device for rerank model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - rerank_device = device_widget() - - rerank_device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - print(f"Rerenk model will be loaded to {rerank_device.value} device for text reranking") - - -.. parsed-literal:: - - Rerenk model will be loaded to CPU device for text reranking - - -Select device for LLM model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - llm_device = device_widget() - - llm_device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - print(f"LLM model will be loaded to {llm_device.value} device for response generation") - - -.. parsed-literal:: - - LLM model will be loaded to CPU device for response generation - - -Load models ------------ - - - -Load embedding model -~~~~~~~~~~~~~~~~~~~~ - - - -Now a Hugging Face embedding model can be supported by OpenVINO through -`OpenVINOEmbeddings `__ -and -`OpenVINOBgeEmbeddings `__\ classes -of LangChain. - -.. code:: ipython3 - - from langchain_community.embeddings import OpenVINOBgeEmbeddings - - embedding_model_name = npu_embedding_dir if USING_NPU else embedding_model_id.value - batch_size = 1 if USING_NPU else 4 - embedding_model_kwargs = {"device": embedding_device.value, "compile": False} - encode_kwargs = { - "mean_pooling": embedding_model_configuration["mean_pooling"], - "normalize_embeddings": embedding_model_configuration["normalize_embeddings"], - "batch_size": batch_size, - } - - embedding = OpenVINOBgeEmbeddings( - model_name_or_path=embedding_model_name, - model_kwargs=embedding_model_kwargs, - encode_kwargs=encode_kwargs, - ) - if USING_NPU: - embedding.ov_model.reshape(1, 512) - embedding.ov_model.compile() - - text = "This is a test document." - embedding_result = embedding.embed_query(text) - embedding_result[:3] - - - - -.. parsed-literal:: - - [-0.04169073328375816, 0.0668943002820015, 0.0083105294033885] - - - -Load rerank model -~~~~~~~~~~~~~~~~~ - - - -Now a Hugging Face embedding model can be supported by OpenVINO through -`OpenVINOReranker `__ -class of LangChain. - - **Note**: Rerank can be skipped in RAG. - -.. code:: ipython3 - - from langchain_community.document_compressors.openvino_rerank import OpenVINOReranker - - rerank_model_name = rerank_model_id.value - rerank_model_kwargs = {"device": rerank_device.value} - rerank_top_n = 2 - - reranker = OpenVINOReranker( - model_name_or_path=rerank_model_name, - model_kwargs=rerank_model_kwargs, - top_n=rerank_top_n, - ) - - -.. parsed-literal:: - - /home2/ethan/intel/openvino_notebooks/openvino_venv/lib/python3.10/site-packages/pydantic/_internal/_fields.py:132: UserWarning: Field "model_name_or_path" in OpenVINOReranker has conflict with protected namespace "model_". 
-
-    You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.
-      warnings.warn(
-    /home2/ethan/intel/openvino_notebooks/openvino_venv/lib/python3.10/site-packages/pydantic/_internal/_fields.py:132: UserWarning: Field "model_kwargs" in OpenVINOReranker has conflict with protected namespace "model_".
-
-    You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.
-      warnings.warn(
-
-
-Load LLM model
-~~~~~~~~~~~~~~
-
-
-
-``OpenVINOLLM`` is an LLM wrapper around the `openvino_genai
-API `__ for
-LangChain. Models can be loaded by specifying the model parameters with
-the ``from_model_path`` method. If you have an Intel GPU or NPU, you can
-specify ``device="GPU"`` or ``device="NPU"`` to run inference on it.
-
-.. code:: ipython3
-
-    available_models = []
-    if int4_model_dir.exists():
-        available_models.append("INT4")
-    if int8_model_dir.exists():
-        available_models.append("INT8")
-    if fp16_model_dir.exists():
-        available_models.append("FP16")
-
-    model_to_run = widgets.Dropdown(
-        options=available_models,
-        value=available_models[0],
-        description="Model to run:",
-        disabled=False,
-    )
-
-    model_to_run
-
-
-
-
-.. parsed-literal::
-
-    Dropdown(description='Model to run:', options=('INT4',), value='INT4')
-
-
-
-``OpenVINOLLM`` also supports a customized tokenizer for better accuracy
-with some specific models. For example, the default OpenVINO tokenizer
-can be replaced with a Transformers-based tokenizer through the
-following steps:
-
-.. code:: python
-
-   from transformers import AutoTokenizer
-   tokenizer = AutoTokenizer.from_pretrained(str(model_dir))
-   llm = OpenVINOLLM.from_model_path(
-       model_path=str(model_dir),
-       device=llm_device.value,
-       tokenizer=tokenizer,
-   )
-
-.. code:: ipython3
-
-    from ov_langchain_helper import OpenVINOLLM
-
-
-    if model_to_run.value == "INT4":
-        model_dir = int4_model_dir
-    elif model_to_run.value == "INT8":
-        model_dir = int8_model_dir
-    else:
-        model_dir = fp16_model_dir
-    print(f"Loading model from {model_dir}")
-
-
-    llm = OpenVINOLLM.from_model_path(
-        model_path=str(model_dir),
-        device=llm_device.value,
-        # tokenizer=tokenizer,
-    )
-
-    llm.config.max_new_tokens = 2
-    llm.invoke("2 + 2 =")
-
-
-.. parsed-literal::
-
-    /home2/ethan/intel/openvino_notebooks/openvino_venv/lib/python3.10/site-packages/pydantic/_internal/_fields.py:172: UserWarning: Field name "pipe" in "OpenVINOLLM" shadows an attribute in parent "LLM"
-      warnings.warn(
-
-
-.. parsed-literal::
-
-    Loading model from phi-3-mini-instruct/INT4_compressed_weights
-
-
-
-
-.. parsed-literal::
-
-    '4'
-
-
-
-Run QA over Document
---------------------
-
-
-
-Now that the model is created, we can set up a chatbot interface using
-`Gradio `__.
-
-A typical RAG application has two main components:
-
-- **Indexing**: a pipeline for ingesting data from a source and
-  indexing it. This usually happens offline.
-
-- **Retrieval and generation**: the actual RAG chain, which takes the
-  user query at run time and retrieves the relevant data from the
-  index, then passes that to the model.
-
-The most common full sequence from raw data to answer looks like:
-
-**Indexing**
-
-1. ``Load``: First we need to load our data. We’ll use DocumentLoaders
-   for this.
-2. ``Split``: Text splitters break large Documents into smaller chunks.
-   This is useful both for indexing data and for passing it into a
-   model, since large chunks are harder to search over and won’t fit in a
-   model’s finite context window.
-3. 
``Store``: We need somewhere to store and index our splits, so that - they can later be searched over. This is often done using a - VectorStore and Embeddings model. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/91237924/dfed2ba3-0c3a-4e0e-a2a7-01638730486a - :alt: Indexing pipeline - - Indexing pipeline - -**Retrieval and generation** - -1. ``Retrieve``: Given a user input, relevant splits are retrieved from - storage using a Retriever. -2. ``Generate``: A LLM produces an answer using a prompt that includes - the question and the retrieved data. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/91237924/f0545ddc-c0cd-4569-8c86-9879fdab105a - :alt: Retrieval and generation pipeline - - Retrieval and generation pipeline - -.. code:: ipython3 - - import re - from typing import List - from langchain.text_splitter import ( - CharacterTextSplitter, - RecursiveCharacterTextSplitter, - MarkdownTextSplitter, - ) - from langchain.document_loaders import ( - CSVLoader, - EverNoteLoader, - PyPDFLoader, - TextLoader, - UnstructuredEPubLoader, - UnstructuredHTMLLoader, - UnstructuredMarkdownLoader, - UnstructuredODTLoader, - UnstructuredPowerPointLoader, - UnstructuredWordDocumentLoader, - ) - - - class ChineseTextSplitter(CharacterTextSplitter): - def __init__(self, pdf: bool = False, **kwargs): - super().__init__(**kwargs) - self.pdf = pdf - - def split_text(self, text: str) -> List[str]: - if self.pdf: - text = re.sub(r"\n{3,}", "\n", text) - text = text.replace("\n\n", "") - sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') - sent_list = [] - for ele in sent_sep_pattern.split(text): - if sent_sep_pattern.match(ele) and sent_list: - sent_list[-1] += ele - elif ele: - sent_list.append(ele) - return sent_list - - - TEXT_SPLITERS = { - "Character": CharacterTextSplitter, - "RecursiveCharacter": RecursiveCharacterTextSplitter, - "Markdown": MarkdownTextSplitter, - "Chinese": ChineseTextSplitter, - } - - - LOADERS = { - ".csv": (CSVLoader, {}), - ".doc": (UnstructuredWordDocumentLoader, {}), - ".docx": (UnstructuredWordDocumentLoader, {}), - ".enex": (EverNoteLoader, {}), - ".epub": (UnstructuredEPubLoader, {}), - ".html": (UnstructuredHTMLLoader, {}), - ".md": (UnstructuredMarkdownLoader, {}), - ".odt": (UnstructuredODTLoader, {}), - ".pdf": (PyPDFLoader, {}), - ".ppt": (UnstructuredPowerPointLoader, {}), - ".pptx": (UnstructuredPowerPointLoader, {}), - ".txt": (TextLoader, {"encoding": "utf8"}), - } - - if model_language.value == "English": - text_example_path = "text_example_en.pdf" - else: - text_example_path = "text_example_cn.pdf" - -We can build a RAG pipeline of LangChain through -`create_retrieval_chain `__, -which will help to create a chain to connect RAG components including: - -- `Vector stores `__\ , -- `Retrievers `__ -- `LLM `__ -- `Embedding `__ - -.. 
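-
-Stripped of the document loading, text splitting, reranking and UI code
-handled in the next cell, the core wiring of such a chain looks roughly as
-follows (a minimal sketch that reuses the ``embedding``, ``llm`` and
-``llm_model_configuration`` objects created above; the document text and
-question are placeholders):
-
-.. code:: python
-
-    # Minimal sketch of the chain wiring; the full cell below adds document
-    # loading, text splitting, reranking and the Gradio UI around it.
-    from langchain.prompts import PromptTemplate
-    from langchain_community.vectorstores import FAISS
-    from langchain.chains.retrieval import create_retrieval_chain
-    from langchain.chains.combine_documents import create_stuff_documents_chain
-    from langchain.docstore.document import Document
-
-    docs = [Document(page_content="OpenVINO optimizes deep learning inference.")]
-    db = FAISS.from_documents(docs, embedding)            # vector store + embedding model
-    retriever = db.as_retriever(search_kwargs={"k": 4})   # retriever
-
-    prompt = PromptTemplate.from_template(llm_model_configuration["rag_prompt_template"])
-    combine_docs_chain = create_stuff_documents_chain(llm, prompt)   # LLM + prompt
-    rag_chain = create_retrieval_chain(retriever, combine_docs_chain)
-
-    print(rag_chain.invoke({"input": "What does OpenVINO do?"})["answer"])
-
-.. 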
code:: ipython3 - - from langchain.prompts import PromptTemplate - from langchain_community.vectorstores import FAISS - from langchain.chains.retrieval import create_retrieval_chain - from langchain.chains.combine_documents import create_stuff_documents_chain - from langchain.docstore.document import Document - from langchain.retrievers import ContextualCompressionRetriever - import gradio as gr - - rag_prompt_template = llm_model_configuration["rag_prompt_template"] - - - def load_single_document(file_path: str) -> List[Document]: - """ - helper for loading a single document - - Params: - file_path: document path - Returns: - documents loaded - - """ - ext = "." + file_path.rsplit(".", 1)[-1] - if ext in LOADERS: - loader_class, loader_args = LOADERS[ext] - loader = loader_class(file_path, **loader_args) - return loader.load() - - raise ValueError(f"File does not exist '{ext}'") - - - def default_partial_text_processor(partial_text: str, new_text: str): - """ - helper for updating partially generated answer, used by default - - Params: - partial_text: text buffer for storing previosly generated text - new_text: text update for the current step - Returns: - updated text string - - """ - partial_text += new_text - return partial_text - - - text_processor = llm_model_configuration.get("partial_text_processor", default_partial_text_processor) - - - def create_vectordb( - docs, spliter_name, chunk_size, chunk_overlap, vector_search_top_k, vector_rerank_top_n, run_rerank, search_method, score_threshold, progress=gr.Progress() - ): - """ - Initialize a vector database - - Params: - doc: orignal documents provided by user - spliter_name: spliter method - chunk_size: size of a single sentence chunk - chunk_overlap: overlap size between 2 chunks - vector_search_top_k: Vector search top k - vector_rerank_top_n: Search rerank top n - run_rerank: whether run reranker - search_method: top k search method - score_threshold: score threshold when selecting 'similarity_score_threshold' method - - """ - global db - global retriever - global combine_docs_chain - global rag_chain - - if vector_rerank_top_n > vector_search_top_k: - gr.Warning("Search top k must >= Rerank top n") - - documents = [] - for doc in docs: - if type(doc) is not str: - doc = doc.name - documents.extend(load_single_document(doc)) - - text_splitter = TEXT_SPLITERS[spliter_name](chunk_size=chunk_size, chunk_overlap=chunk_overlap) - - texts = text_splitter.split_documents(documents) - db = FAISS.from_documents(texts, embedding) - if search_method == "similarity_score_threshold": - search_kwargs = {"k": vector_search_top_k, "score_threshold": score_threshold} - else: - search_kwargs = {"k": vector_search_top_k} - retriever = db.as_retriever(search_kwargs=search_kwargs, search_type=search_method) - if run_rerank: - reranker.top_n = vector_rerank_top_n - retriever = ContextualCompressionRetriever(base_compressor=reranker, base_retriever=retriever) - prompt = PromptTemplate.from_template(rag_prompt_template) - combine_docs_chain = create_stuff_documents_chain(llm, prompt) - - rag_chain = create_retrieval_chain(retriever, combine_docs_chain) - - return "Vector database is Ready" - - - def update_retriever(vector_search_top_k, vector_rerank_top_n, run_rerank, search_method, score_threshold): - """ - Update retriever - - Params: - vector_search_top_k: Vector search top k - vector_rerank_top_n: Search rerank top n - run_rerank: whether run reranker - search_method: top k search method - score_threshold: score threshold when selecting 
'similarity_score_threshold' method - - """ - global db - global retriever - global combine_docs_chain - global rag_chain - - if vector_rerank_top_n > vector_search_top_k: - gr.Warning("Search top k must >= Rerank top n") - - if search_method == "similarity_score_threshold": - search_kwargs = {"k": vector_search_top_k, "score_threshold": score_threshold} - else: - search_kwargs = {"k": vector_search_top_k} - retriever = db.as_retriever(search_kwargs=search_kwargs, search_type=search_method) - if run_rerank: - retriever = ContextualCompressionRetriever(base_compressor=reranker, base_retriever=retriever) - reranker.top_n = vector_rerank_top_n - rag_chain = create_retrieval_chain(retriever, combine_docs_chain) - - return "Vector database is Ready" - - - # initialize the vector store with example document - create_vectordb( - [text_example_path], - "RecursiveCharacter", - chunk_size=400, - chunk_overlap=50, - vector_search_top_k=10, - vector_rerank_top_n=2, - run_rerank=True, - search_method="similarity_score_threshold", - score_threshold=0.5, - ) - - - def bot(history, temperature, top_p, top_k, repetition_penalty, hide_full_prompt): - """ - callback function for running chatbot on submit button click - - Params: - message: new message from user - history: conversation history - temperature: parameter for control the level of creativity in AI-generated text. - By adjusting the `temperature`, you can influence the AI model's probability distribution, making the text more focused or diverse. - top_p: parameter for control the range of tokens considered by the AI model based on their cumulative probability. - top_k: parameter for control the range of tokens considered by the AI model based on their cumulative probability, selecting number of tokens with highest probability. - repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text. - active_chat: chat state, if true then chat is running, if false then we should start it here. - Returns: - message: reset message and make it "" - history: updated history with message and answer from chatbot - active_chat: if we are here, the chat is running or will be started, so return True - """ - - llm.config.temperature = temperature - llm.config.top_p = 1 - llm.config.top_k = 2 - llm.config.do_sample = temperature > 0.0 - llm.config.max_new_tokens = 1024 - llm.config.repetition_penalty = repetition_penalty - - partial_text = "" - streaming_response = rag_chain.stream({"input": history[-1][0]}) - for new_text in streaming_response: - if list(new_text.keys())[0] == "answer": - partial_text = text_processor(partial_text, list(new_text.values())[0]) - history[-1][1] = partial_text - yield history - -Next we can create a Gradio UI and run demo. - -.. code:: ipython3 - - streaming_response = rag_chain.stream({"input": "hi"}) - for new_text in streaming_response: - if list(new_text.keys())[0] == "answer": - print(list(new_text.values())[0], end="", flush=True) - - -.. parsed-literal:: - - No relevant docs were retrieved using the relevance score threshold 0.5 - - -.. parsed-literal:: - - Hello! - -.. 
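-
-The same chain can also be called synchronously when token-by-token
-streaming is not needed (the question below is only an example; it refers
-to the sample vPro platform brief indexed above):
-
-.. code:: python
-
-    # Synchronous call; the result is a dict with "input", "context" and "answer".
-    response = rag_chain.invoke({"input": "What is Intel vPro?"})
-    print(response["answer"])
-
-.. 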
code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-rag-langchain/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo( - load_doc_fn=create_vectordb, - run_fn=bot, - stop_fn=None, - update_retriever_fn=update_retriever, - model_name=llm_model_id.value, - language=model_language.value, - ) - - try: - demo.queue().launch() - except Exception: - demo.queue().launch(share=True) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ - -.. code:: ipython3 - - # please uncomment and run this cell for stopping gradio interface - # demo.close() diff --git a/docs/notebooks/llm-rag-langchain-with-output.rst b/docs/notebooks/llm-rag-langchain-with-output.rst deleted file mode 100644 index 56cd7eecbbb1d3..00000000000000 --- a/docs/notebooks/llm-rag-langchain-with-output.rst +++ /dev/null @@ -1,1441 +0,0 @@ -Create a RAG system using OpenVINO and LangChain -================================================ - -**Retrieval-augmented generation (RAG)** is a technique for augmenting -LLM knowledge with additional, often private or real-time, data. LLMs -can reason about wide-ranging topics, but their knowledge is limited to -the public data up to a specific point in time that they were trained -on. If you want to build AI applications that can reason about private -data or data introduced after a model’s cutoff date, you need to augment -the knowledge of the model with the specific information it needs. The -process of bringing the appropriate information and inserting it into -the model prompt is known as Retrieval Augmented Generation (RAG). - -`LangChain `__ -is a framework for developing applications powered by language models. -It has a number of components specifically designed to help build RAG -applications. In this tutorial, we’ll build a simple question-answering -application over a text data source. - -The tutorial consists of the following steps: - -- Install prerequisites -- Download and convert the model from a public source using the - `OpenVINO integration with Hugging Face - Optimum `__. -- Compress model weights to 4-bit or 8-bit data types using - `NNCF `__ -- Create a RAG chain pipeline -- Run Q&A pipeline - -In this example, the customized RAG pipeline consists of following -components in order, where embedding, rerank and LLM will be deployed -with OpenVINO to optimize their inference performance. - -.. 
figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/91237924/0076f6c7-75e4-4c2e-9015-87b355e5ca28 - :alt: RAG - - RAG - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Select model for inference <#select-model-for-inference>`__ -- `login to huggingfacehub to get access to pretrained - model <#login-to-huggingfacehub-to-get-access-to-pretrained-model>`__ -- `Convert model and compress model - weights <#convert-model-and-compress-model-weights>`__ - - - `LLM conversion and Weights Compression using - Optimum-CLI <#llm-conversion-and-weights-compression-using-optimum-cli>`__ - - - `Weight compression with AWQ <#weight-compression-with-awq>`__ - - - `Convert embedding model using - Optimum-CLI <#convert-embedding-model-using-optimum-cli>`__ - - `Convert rerank model using - Optimum-CLI <#convert-rerank-model-using-optimum-cli>`__ - -- `Select device for inference and model - variant <#select-device-for-inference-and-model-variant>`__ - - - `Select device for embedding model - inference <#select-device-for-embedding-model-inference>`__ - - `Select device for rerank model - inference <#select-device-for-rerank-model-inference>`__ - - `Select device for LLM model - inference <#select-device-for-llm-model-inference>`__ - -- `Load model <#load-model>`__ - - - `Load embedding model <#load-embedding-model>`__ - - `Load rerank model <#load-rerank-model>`__ - - `Load LLM model <#load-llm-model>`__ - -- `Run QA over Document <#run-qa-over-document>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install required dependencies - -.. 
code:: ipython3 - - import os - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - with open("notebook_utils.py", "w") as f: - f.write(r.text) - - if not Path("pip_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py", - ) - open("pip_helper.py", "w").write(r.text) - - from pip_helper import pip_install - - os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - - pip_install("--pre", "-U", "openvino>=2024.2.0", "--extra-index-url", "https://storage.openvinotoolkit.org/simple/wheels/nightly") - pip_install("--pre", "-U", "openvino-tokenizers[transformers]", "--extra-index-url", "https://storage.openvinotoolkit.org/simple/wheels/nightly") - pip_install( - "-q", - "--extra-index-url", - "https://download.pytorch.org/whl/cpu", - "git+https://github.com/huggingface/optimum-intel.git", - "git+https://github.com/openvinotoolkit/nncf.git", - "datasets", - "accelerate", - "gradio>=4.19", - "onnx<1.16.2", - "einops", - "transformers_stream_generator", - "tiktoken", - "transformers>=4.43.1", - "faiss-cpu", - "sentence_transformers", - "langchain>=0.2.0", - "langchain-community>=0.2.15", - "langchainhub", - "unstructured", - "scikit-learn", - "python-docx", - "pypdf", - "huggingface-hub>=0.26.5", - ) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("llm-rag-langchain.ipynb") - -.. code:: ipython3 - - import os - from pathlib import Path - import requests - import shutil - import io - - # fetch model configuration - - config_shared_path = Path("../../utils/llm_config.py") - config_dst_path = Path("llm_config.py") - text_example_en_path = Path("text_example_en.pdf") - text_example_cn_path = Path("text_example_cn.pdf") - text_example_en = "https://github.com/openvinotoolkit/openvino_notebooks/files/15039728/Platform.Brief_Intel.vPro.with.Intel.Core.Ultra_Final.pdf" - text_example_cn = "https://github.com/openvinotoolkit/openvino_notebooks/files/15039713/Platform.Brief_Intel.vPro.with.Intel.Core.Ultra_Final_CH.pdf" - - if not config_dst_path.exists(): - if config_shared_path.exists(): - try: - os.symlink(config_shared_path, config_dst_path) - except Exception: - shutil.copy(config_shared_path, config_dst_path) - else: - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py") - with open("llm_config.py", "w", encoding="utf-8") as f: - f.write(r.text) - elif not os.path.islink(config_dst_path): - print("LLM config will be updated") - if config_shared_path.exists(): - shutil.copy(config_shared_path, config_dst_path) - else: - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py") - with open("llm_config.py", "w", encoding="utf-8") as f: - f.write(r.text) - - if not text_example_en_path.exists(): - r = requests.get(url=text_example_en) - content = io.BytesIO(r.content) - with open("text_example_en.pdf", "wb") as f: - f.write(content.read()) - - if not text_example_cn_path.exists(): - r = requests.get(url=text_example_cn) - content = io.BytesIO(r.content) - with open("text_example_cn.pdf", "wb") as f: - f.write(content.read()) - - -.. 
parsed-literal:: - - LLM config will be updated - - -Select model for inference --------------------------- - - - -The tutorial supports different models, you can select one from the -provided options to compare the quality of open source LLM solutions. - - **Note**: conversion of some models can require additional actions - from user side and at least 64GB RAM for conversion. - -The available embedding model options are: - -- `bge-small-en-v1.5 `__ -- `bge-small-zh-v1.5 `__ -- `bge-large-en-v1.5 `__ -- `bge-large-zh-v1.5 `__ -- `bge-m3 `__ - -BGE embedding is a general Embedding Model. The model is pre-trained -using RetroMAE and trained on large-scale pair data using contrastive -learning. - -The available rerank model options are: - -- `bge-reranker-v2-m3 `__ -- `bge-reranker-large `__ -- `bge-reranker-base `__ - -Reranker model with cross-encoder will perform full-attention over the -input pair, which is more accurate than embedding model (i.e., -bi-encoder) but more time-consuming than embedding model. Therefore, it -can be used to re-rank the top-k documents returned by embedding model. - -You can also find available LLM model options in -`llm-chatbot `__ notebook. - -.. code:: ipython3 - - from pathlib import Path - import torch - import ipywidgets as widgets - from transformers import ( - TextIteratorStreamer, - StoppingCriteria, - StoppingCriteriaList, - ) - -Convert model and compress model weights ----------------------------------------- - - - -The Weights Compression algorithm is aimed at compressing the weights of -the models and can be used to optimize the model footprint and -performance of large models where the size of weights is relatively -larger than the size of activations, for example, Large Language Models -(LLM). Compared to INT8 compression, INT4 compression improves -performance even more, but introduces a minor drop in prediction -quality. - -.. code:: ipython3 - - from llm_config import ( - SUPPORTED_EMBEDDING_MODELS, - SUPPORTED_RERANK_MODELS, - SUPPORTED_LLM_MODELS, - ) - - model_languages = list(SUPPORTED_LLM_MODELS) - - model_language = widgets.Dropdown( - options=model_languages, - value=model_languages[0], - description="Model Language:", - disabled=False, - ) - - model_language - - - - -.. parsed-literal:: - - Dropdown(description='Model Language:', options=('English', 'Chinese', 'Japanese'), value='English') - - - -.. code:: ipython3 - - llm_model_ids = [model_id for model_id, model_config in SUPPORTED_LLM_MODELS[model_language.value].items() if model_config.get("rag_prompt_template")] - - llm_model_id = widgets.Dropdown( - options=llm_model_ids, - value=llm_model_ids[-1], - description="Model:", - disabled=False, - ) - - llm_model_id - - - - -.. parsed-literal:: - - Dropdown(description='Model:', index=12, options=('tiny-llama-1b-chat', 'gemma-2b-it', 'red-pajama-3b-chat', '… - - - -.. code:: ipython3 - - llm_model_configuration = SUPPORTED_LLM_MODELS[model_language.value][llm_model_id.value] - print(f"Selected LLM model {llm_model_id.value}") - - -.. parsed-literal:: - - Selected LLM model phi-3-mini-instruct - - -`Optimum Intel `__ is -the interface between the -`Transformers `__ and -`Diffusers `__ libraries -and OpenVINO to accelerate end-to-end pipelines on Intel architectures. -It provides ease-to-use cli interface for exporting models to `OpenVINO -Intermediate Representation -(IR) `__ -format. 
- -The command bellow demonstrates basic command for model export with -``optimum-cli`` - -:: - - optimum-cli export openvino --model --task - -where ``--model`` argument is model id from HuggingFace Hub or local -directory with model (saved using ``.save_pretrained`` method), -``--task`` is one of `supported -task `__ -that exported model should solve. For LLMs it will be -``text-generation-with-past``. If model initialization requires to use -remote code, ``--trust-remote-code`` flag additionally should be passed. - -LLM conversion and Weights Compression using Optimum-CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -You can also apply fp16, 8-bit or 4-bit weight compression on the -Linear, Convolutional and Embedding layers when exporting your model -with the CLI by setting ``--weight-format`` to respectively fp16, int8 -or int4. This type of optimization allows to reduce the memory footprint -and inference latency. By default the quantization scheme for int8/int4 -will be -`asymmetric `__, -to make it -`symmetric `__ -you can add ``--sym``. - -For INT4 quantization you can also specify the following arguments : - -- The ``--group-size`` parameter will define the group size to use for - quantization, -1 it will results in per-column quantization. -- The ``--ratio`` parameter controls the ratio between 4-bit and 8-bit - quantization. If set to 0.9, it means that 90% of the layers will be - quantized to int4 while 10% will be quantized to int8. - -Smaller group_size and ratio values usually improve accuracy at the -sacrifice of the model size and inference latency. - - **Note**: There may be no speedup for INT4/INT8 compressed models on - dGPU. - -.. code:: ipython3 - - from IPython.display import Markdown, display - - prepare_int4_model = widgets.Checkbox( - value=True, - description="Prepare INT4 model", - disabled=False, - ) - prepare_int8_model = widgets.Checkbox( - value=False, - description="Prepare INT8 model", - disabled=False, - ) - prepare_fp16_model = widgets.Checkbox( - value=False, - description="Prepare FP16 model", - disabled=False, - ) - - display(prepare_int4_model) - display(prepare_int8_model) - display(prepare_fp16_model) - - - -.. parsed-literal:: - - Checkbox(value=True, description='Prepare INT4 model') - - - -.. parsed-literal:: - - Checkbox(value=False, description='Prepare INT8 model') - - - -.. parsed-literal:: - - Checkbox(value=False, description='Prepare FP16 model') - - -Weight compression with AWQ -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -`Activation-aware Weight -Quantization `__ (AWQ) is an algorithm -that tunes model weights for more accurate INT4 compression. It slightly -improves generation quality of compressed LLMs, but requires significant -additional time for tuning weights on a calibration dataset. We use -``wikitext-2-raw-v1/train`` subset of the -`Wikitext `__ -dataset for calibration. - -Below you can enable AWQ to be additionally applied during model export -with INT4 precision. - - **Note**: Applying AWQ requires significant memory and time. - -.. - - **Note**: It is possible that there will be no matching patterns in - the model to apply AWQ, in such case it will be skipped. - -.. code:: ipython3 - - enable_awq = widgets.Checkbox( - value=False, - description="Enable AWQ", - disabled=not prepare_int4_model.value, - ) - display(enable_awq) - - - -.. parsed-literal:: - - Checkbox(value=False, description='Enable AWQ') - - -.. 
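-
-When AWQ is enabled, the export command assembled by the next cell simply
-gains the additional data-aware options (a sketch only; the model id and
-output directory are placeholders):
-
-.. code:: python
-
-    # Hypothetical INT4 export command with AWQ calibration enabled.
-    model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # placeholder model id
-
-    export_command = (
-        f"optimum-cli export openvino --model {model_id} "
-        "--task text-generation-with-past --weight-format int4 "
-        "--group-size 128 --ratio 0.8 --sym "
-        "--awq --dataset wikitext2 --num-samples 128 "
-        "tiny-llama-1b-chat/INT4_compressed_weights"  # placeholder output dir
-    )
-    print(export_command)
-
-.. 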
code:: ipython3 - - pt_model_id = llm_model_configuration["model_id"] - pt_model_name = llm_model_id.value.split("-")[0] - fp16_model_dir = Path(llm_model_id.value) / "FP16" - int8_model_dir = Path(llm_model_id.value) / "INT8_compressed_weights" - int4_model_dir = Path(llm_model_id.value) / "INT4_compressed_weights" - - - def convert_to_fp16(): - if (fp16_model_dir / "openvino_model.xml").exists(): - return - remote_code = llm_model_configuration.get("remote_code", False) - export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format fp16".format(pt_model_id) - if remote_code: - export_command_base += " --trust-remote-code" - export_command = export_command_base + " " + str(fp16_model_dir) - display(Markdown("**Export command:**")) - display(Markdown(f"`{export_command}`")) - ! $export_command - - - def convert_to_int8(): - if (int8_model_dir / "openvino_model.xml").exists(): - return - int8_model_dir.mkdir(parents=True, exist_ok=True) - remote_code = llm_model_configuration.get("remote_code", False) - export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int8".format(pt_model_id) - if remote_code: - export_command_base += " --trust-remote-code" - export_command = export_command_base + " " + str(int8_model_dir) - display(Markdown("**Export command:**")) - display(Markdown(f"`{export_command}`")) - ! $export_command - - - def convert_to_int4(): - compression_configs = { - "zephyr-7b-beta": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "mistral-7b": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "minicpm-2b-dpo": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "gemma-2b-it": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "notus-7b-v1": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "neural-chat-7b-v3-1": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "llama-2-chat-7b": { - "sym": True, - "group_size": 128, - "ratio": 0.8, - }, - "llama-3-8b-instruct": { - "sym": True, - "group_size": 128, - "ratio": 0.8, - }, - "gemma-7b-it": { - "sym": True, - "group_size": 128, - "ratio": 0.8, - }, - "chatglm2-6b": { - "sym": True, - "group_size": 128, - "ratio": 0.72, - }, - "qwen-7b-chat": {"sym": True, "group_size": 128, "ratio": 0.6}, - "red-pajama-3b-chat": { - "sym": False, - "group_size": 128, - "ratio": 0.5, - }, - "qwen2.5-7b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "qwen2.5-3b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "qwen2.5-14b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "qwen2.5-1.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "default": { - "sym": False, - "group_size": 128, - "ratio": 0.8, - }, - } - - model_compression_params = compression_configs.get(llm_model_id.value, compression_configs["default"]) - if (int4_model_dir / "openvino_model.xml").exists(): - return - remote_code = llm_model_configuration.get("remote_code", False) - export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int4".format(pt_model_id) - int4_compression_args = " --group-size {} --ratio {}".format(model_compression_params["group_size"], model_compression_params["ratio"]) - if model_compression_params["sym"]: - int4_compression_args += " --sym" - if enable_awq.value: - int4_compression_args += " --awq --dataset wikitext2 
--num-samples 128" - export_command_base += int4_compression_args - if remote_code: - export_command_base += " --trust-remote-code" - export_command = export_command_base + " " + str(int4_model_dir) - display(Markdown("**Export command:**")) - display(Markdown(f"`{export_command}`")) - ! $export_command - - - if prepare_fp16_model.value: - convert_to_fp16() - if prepare_int8_model.value: - convert_to_int8() - if prepare_int4_model.value: - convert_to_int4() - -Let’s compare model size for different compression types - -.. code:: ipython3 - - fp16_weights = fp16_model_dir / "openvino_model.bin" - int8_weights = int8_model_dir / "openvino_model.bin" - int4_weights = int4_model_dir / "openvino_model.bin" - - if fp16_weights.exists(): - print(f"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB") - for precision, compressed_weights in zip([8, 4], [int8_weights, int4_weights]): - if compressed_weights.exists(): - print(f"Size of model with INT{precision} compressed weights is {compressed_weights.stat().st_size / 1024 / 1024:.2f} MB") - if compressed_weights.exists() and fp16_weights.exists(): - print(f"Compression rate for INT{precision} model: {fp16_weights.stat().st_size / compressed_weights.stat().st_size:.3f}") - - -.. parsed-literal:: - - Size of model with INT4 compressed weights is 2319.41 MB - - -Convert embedding model using Optimum-CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Since some embedding models can only support limited languages, we can -filter them out according the LLM you selected. - -.. code:: ipython3 - - embedding_model_id = list(SUPPORTED_EMBEDDING_MODELS[model_language.value]) - - embedding_model_id = widgets.Dropdown( - options=embedding_model_id, - value=embedding_model_id[0], - description="Embedding Model:", - disabled=False, - ) - - embedding_model_id - - - - -.. parsed-literal:: - - Dropdown(description='Embedding Model:', options=('bge-small-en-v1.5', 'bge-large-en-v1.5', 'bge-m3'), value='… - - - -.. code:: ipython3 - - embedding_model_configuration = SUPPORTED_EMBEDDING_MODELS[model_language.value][embedding_model_id.value] - print(f"Selected {embedding_model_id.value} model") - - -.. parsed-literal:: - - Selected bge-small-en-v1.5 model - - -OpenVINO embedding model and tokenizer can be exported by -``feature-extraction`` task with ``optimum-cli``. - -.. code:: ipython3 - - export_command_base = "optimum-cli export openvino --model {} --task feature-extraction".format(embedding_model_configuration["model_id"]) - export_command = export_command_base + " " + str(embedding_model_id.value) - - if not Path(embedding_model_id.value).exists(): - ! $export_command - -Convert rerank model using Optimum-CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - rerank_model_id = list(SUPPORTED_RERANK_MODELS) - - rerank_model_id = widgets.Dropdown( - options=rerank_model_id, - value=rerank_model_id[0], - description="Rerank Model:", - disabled=False, - ) - - rerank_model_id - - - - -.. parsed-literal:: - - Dropdown(description='Rerank Model:', options=('bge-reranker-v2-m3', 'bge-reranker-large', 'bge-reranker-base'… - - - -.. code:: ipython3 - - rerank_model_configuration = SUPPORTED_RERANK_MODELS[rerank_model_id.value] - print(f"Selected {rerank_model_id.value} model") - - -.. parsed-literal:: - - Selected bge-reranker-v2-m3 model - - -Since ``rerank`` model is sort of sentence classification task, its -OpenVINO IR and tokenizer can be exported by ``text-classification`` -task with ``optimum-cli``. - -.. 
code:: ipython3 - - export_command_base = "optimum-cli export openvino --model {} --task text-classification".format(rerank_model_configuration["model_id"]) - export_command = export_command_base + " " + str(rerank_model_id.value) - - if not Path(rerank_model_id.value).exists(): - ! $export_command - -Select device for inference and model variant ---------------------------------------------- - - - - **Note**: There may be no speedup for INT4/INT8 compressed models on - dGPU. - -Select device for embedding model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - embedding_device = device_widget() - - embedding_device - - -.. parsed-literal:: - - [ERROR] 03:22:19.719 [NPUBackends] Cannot find backend for inference. Make sure the device is available. - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - print(f"Embedding model will be loaded to {embedding_device.value} device for text embedding") - - -.. parsed-literal:: - - Embedding model will be loaded to AUTO device for text embedding - - -Optimize the BGE embedding model’s parameter precision when loading -model to NPU device. - -.. code:: ipython3 - - from notebook_utils import optimize_bge_embedding - - USING_NPU = embedding_device.value == "NPU" - - npu_embedding_dir = embedding_model_id.value + "-npu" - npu_embedding_path = Path(npu_embedding_dir) / "openvino_model.xml" - if USING_NPU and not Path(npu_embedding_dir).exists(): - shutil.copytree(embedding_model_id.value, npu_embedding_dir) - optimize_bge_embedding(Path(embedding_model_id.value) / "openvino_model.xml", npu_embedding_path) - -Select device for rerank model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - rerank_device = device_widget() - - rerank_device - - -.. parsed-literal:: - - [ERROR] 03:22:20.604 [NPUBackends] Cannot find backend for inference. Make sure the device is available. - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - print(f"Rerenk model will be loaded to {rerank_device.value} device for text reranking") - - -.. parsed-literal:: - - Rerenk model will be loaded to AUTO device for text reranking - - -Select device for LLM model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - llm_device = device_widget("CPU", exclude=["NPU"]) - - llm_device - - -.. parsed-literal:: - - [ERROR] 03:22:21.229 [NPUBackends] Cannot find backend for inference. Make sure the device is available. - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -.. code:: ipython3 - - print(f"LLM model will be loaded to {llm_device.value} device for response generation") - - -.. parsed-literal:: - - LLM model will be loaded to CPU device for response generation - - -Load models ------------ - - - -Load embedding model -~~~~~~~~~~~~~~~~~~~~ - - - -Now a Hugging Face embedding model can be supported by OpenVINO through -`OpenVINOEmbeddings `__ -and -`OpenVINOBgeEmbeddings `__\ classes -of LangChain. - -.. 
code:: ipython3 - - from langchain_community.embeddings import OpenVINOBgeEmbeddings - - embedding_model_name = npu_embedding_dir if USING_NPU else embedding_model_id.value - batch_size = 1 if USING_NPU else 4 - embedding_model_kwargs = {"device": embedding_device.value, "compile": False} - encode_kwargs = { - "mean_pooling": embedding_model_configuration["mean_pooling"], - "normalize_embeddings": embedding_model_configuration["normalize_embeddings"], - "batch_size": batch_size, - } - - embedding = OpenVINOBgeEmbeddings( - model_name_or_path=embedding_model_name, - model_kwargs=embedding_model_kwargs, - encode_kwargs=encode_kwargs, - ) - if USING_NPU: - embedding.ov_model.reshape(1, 512) - embedding.ov_model.compile() - - text = "This is a test document." - embedding_result = embedding.embed_query(text) - embedding_result[:3] - - -.. parsed-literal:: - - Compiling the model to AUTO ... - - -.. parsed-literal:: - - [ERROR] 03:22:26.363 [NPUBackends] Cannot find backend for inference. Make sure the device is available. - - - - -.. parsed-literal:: - - [-0.04208654910326004, 0.06681869924068451, 0.007916687056422234] - - - -Load rerank model -~~~~~~~~~~~~~~~~~ - - - -Now a Hugging Face embedding model can be supported by OpenVINO through -`OpenVINOReranker `__ -class of LangChain. - - **Note**: Rerank can be skipped in RAG. - -.. code:: ipython3 - - from langchain_community.document_compressors.openvino_rerank import OpenVINOReranker - - rerank_model_name = rerank_model_id.value - rerank_model_kwargs = {"device": rerank_device.value} - rerank_top_n = 2 - - reranker = OpenVINOReranker( - model_name_or_path=rerank_model_name, - model_kwargs=rerank_model_kwargs, - top_n=rerank_top_n, - ) - - -.. parsed-literal:: - - Compiling the model to AUTO ... - - -Load LLM model -~~~~~~~~~~~~~~ - - - -OpenVINO models can be run locally through the ``HuggingFacePipeline`` -class. To deploy a model with OpenVINO, you can specify the -``backend="openvino"`` parameter to trigger OpenVINO as backend -inference framework. - -.. code:: ipython3 - - available_models = [] - if int4_model_dir.exists(): - available_models.append("INT4") - if int8_model_dir.exists(): - available_models.append("INT8") - if fp16_model_dir.exists(): - available_models.append("FP16") - - model_to_run = widgets.Dropdown( - options=available_models, - value=available_models[0], - description="Model to run:", - disabled=False, - ) - - model_to_run - - - - -.. parsed-literal:: - - Dropdown(description='Model to run:', options=('INT4',), value='INT4') - - - -OpenVINO models can be run locally through the ``HuggingFacePipeline`` -class in -`LangChain `__. -To deploy a model with OpenVINO, you can specify the -``backend="openvino"`` parameter to trigger OpenVINO as backend -inference framework. - -.. code:: ipython3 - - from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline - - import openvino.properties as props - import openvino.properties.hint as hints - import openvino.properties.streams as streams - - - if model_to_run.value == "INT4": - model_dir = int4_model_dir - elif model_to_run.value == "INT8": - model_dir = int8_model_dir - else: - model_dir = fp16_model_dir - print(f"Loading model from {model_dir}") - - ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""} - - if "GPU" in llm_device.value and "qwen2-7b-instruct" in llm_model_id.value: - ov_config["GPU_ENABLE_SDPA_OPTIMIZATION"] = "NO" - - # On a GPU device a model is executed in FP16 precision. 
For the red-pajama-3b-chat model there are known accuracy - # issues caused by this, which we avoid by setting precision hint to "f32". - if llm_model_id.value == "red-pajama-3b-chat" and "GPU" in core.available_devices and llm_device.value in ["GPU", "AUTO"]: - ov_config["INFERENCE_PRECISION_HINT"] = "f32" - - llm = HuggingFacePipeline.from_model_id( - model_id=str(model_dir), - task="text-generation", - backend="openvino", - model_kwargs={ - "device": llm_device.value, - "ov_config": ov_config, - "trust_remote_code": True, - }, - pipeline_kwargs={"max_new_tokens": 2}, - ) - - if llm.pipeline.tokenizer.eos_token_id: - llm.pipeline.tokenizer.pad_token_id = llm.pipeline.tokenizer.eos_token_id - - llm.invoke("2 + 2 =") - - -.. parsed-literal:: - - Loading model from phi-3-mini-instruct/INT4_compressed_weights - - -.. parsed-literal:: - - Compiling the model to CPU ... - - - - -.. parsed-literal:: - - '2 + 2 = 4' - - - -Run QA over Document --------------------- - - - -Now that the model is created, we can set up a chatbot interface using -`Gradio `__. - -A typical RAG application has two main components: - -- **Indexing**: a pipeline for ingesting data from a source and - indexing it. This usually happens offline. - -- **Retrieval and generation**: the actual RAG chain, which takes the - user query at run time, retrieves the relevant data from the - index, and then passes it to the model. - -The most common full sequence from raw data to answer looks like this: - -**Indexing** - -1. ``Load``: First we need to load our data. We’ll use DocumentLoaders - for this. -2. ``Split``: Text splitters break large Documents into smaller chunks. - This is useful both for indexing data and for passing it into a - model, since large chunks are harder to search over and won’t fit in a - model’s finite context window. -3. ``Store``: We need somewhere to store and index our splits, so that - they can later be searched over. This is often done using a - VectorStore and an Embeddings model. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/91237924/dfed2ba3-0c3a-4e0e-a2a7-01638730486a - :alt: Indexing pipeline - - Indexing pipeline - -**Retrieval and generation** - -1. ``Retrieve``: Given a user input, relevant splits are retrieved from - storage using a Retriever. -2. ``Generate``: An LLM produces an answer using a prompt that includes - the question and the retrieved data. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/91237924/f0545ddc-c0cd-4569-8c86-9879fdab105a - :alt: Retrieval and generation pipeline - - Retrieval and generation pipeline - -.. 
code:: ipython3 - - import re - from typing import List - from langchain.text_splitter import ( - CharacterTextSplitter, - RecursiveCharacterTextSplitter, - MarkdownTextSplitter, - ) - from langchain.document_loaders import ( - CSVLoader, - EverNoteLoader, - PyPDFLoader, - TextLoader, - UnstructuredEPubLoader, - UnstructuredHTMLLoader, - UnstructuredMarkdownLoader, - UnstructuredODTLoader, - UnstructuredPowerPointLoader, - UnstructuredWordDocumentLoader, - ) - - - class ChineseTextSplitter(CharacterTextSplitter): - def __init__(self, pdf: bool = False, **kwargs): - super().__init__(**kwargs) - self.pdf = pdf - - def split_text(self, text: str) -> List[str]: - if self.pdf: - text = re.sub(r"\n{3,}", "\n", text) - text = text.replace("\n\n", "") - sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') - sent_list = [] - for ele in sent_sep_pattern.split(text): - if sent_sep_pattern.match(ele) and sent_list: - sent_list[-1] += ele - elif ele: - sent_list.append(ele) - return sent_list - - - TEXT_SPLITERS = { - "Character": CharacterTextSplitter, - "RecursiveCharacter": RecursiveCharacterTextSplitter, - "Markdown": MarkdownTextSplitter, - "Chinese": ChineseTextSplitter, - } - - - LOADERS = { - ".csv": (CSVLoader, {}), - ".doc": (UnstructuredWordDocumentLoader, {}), - ".docx": (UnstructuredWordDocumentLoader, {}), - ".enex": (EverNoteLoader, {}), - ".epub": (UnstructuredEPubLoader, {}), - ".html": (UnstructuredHTMLLoader, {}), - ".md": (UnstructuredMarkdownLoader, {}), - ".odt": (UnstructuredODTLoader, {}), - ".pdf": (PyPDFLoader, {}), - ".ppt": (UnstructuredPowerPointLoader, {}), - ".pptx": (UnstructuredPowerPointLoader, {}), - ".txt": (TextLoader, {"encoding": "utf8"}), - } - - if model_language.value == "English": - text_example_path = "text_example_en.pdf" - else: - text_example_path = "text_example_cn.pdf" - -We can build a RAG pipeline of LangChain through -`create_retrieval_chain `__, -which will help to create a chain to connect RAG components including: - -- `Vector stores `__\ , -- `Retrievers `__ -- `LLM `__ -- `Embedding `__ - -.. code:: ipython3 - - from langchain.prompts import PromptTemplate - from langchain_community.vectorstores import FAISS - from langchain.chains.retrieval import create_retrieval_chain - from langchain.chains.combine_documents import create_stuff_documents_chain - from langchain.docstore.document import Document - from langchain.retrievers import ContextualCompressionRetriever - from threading import Thread - import gradio as gr - - stop_tokens = llm_model_configuration.get("stop_tokens") - rag_prompt_template = llm_model_configuration["rag_prompt_template"] - - - class StopOnTokens(StoppingCriteria): - def __init__(self, token_ids): - self.token_ids = token_ids - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: - for stop_id in self.token_ids: - if input_ids[0][-1] == stop_id: - return True - return False - - - if stop_tokens is not None: - if isinstance(stop_tokens[0], str): - stop_tokens = llm.pipeline.tokenizer.convert_tokens_to_ids(stop_tokens) - - stop_tokens = [StopOnTokens(stop_tokens)] - - - def load_single_document(file_path: str) -> List[Document]: - """ - helper for loading a single document - - Params: - file_path: document path - Returns: - documents loaded - - """ - ext = "." 
+ file_path.rsplit(".", 1)[-1] - if ext in LOADERS: - loader_class, loader_args = LOADERS[ext] - loader = loader_class(file_path, **loader_args) - return loader.load() - - raise ValueError(f"File does not exist '{ext}'") - - - def default_partial_text_processor(partial_text: str, new_text: str): - """ - helper for updating partially generated answer, used by default - - Params: - partial_text: text buffer for storing previosly generated text - new_text: text update for the current step - Returns: - updated text string - - """ - partial_text += new_text - return partial_text - - - text_processor = llm_model_configuration.get("partial_text_processor", default_partial_text_processor) - - - def create_vectordb( - docs, spliter_name, chunk_size, chunk_overlap, vector_search_top_k, vector_rerank_top_n, run_rerank, search_method, score_threshold, progress=gr.Progress() - ): - """ - Initialize a vector database - - Params: - doc: orignal documents provided by user - spliter_name: spliter method - chunk_size: size of a single sentence chunk - chunk_overlap: overlap size between 2 chunks - vector_search_top_k: Vector search top k - vector_rerank_top_n: Search rerank top n - run_rerank: whether run reranker - search_method: top k search method - score_threshold: score threshold when selecting 'similarity_score_threshold' method - - """ - global db - global retriever - global combine_docs_chain - global rag_chain - - if vector_rerank_top_n > vector_search_top_k: - gr.Warning("Search top k must >= Rerank top n") - - documents = [] - for doc in docs: - if type(doc) is not str: - doc = doc.name - documents.extend(load_single_document(doc)) - - text_splitter = TEXT_SPLITERS[spliter_name](chunk_size=chunk_size, chunk_overlap=chunk_overlap) - - texts = text_splitter.split_documents(documents) - db = FAISS.from_documents(texts, embedding) - if search_method == "similarity_score_threshold": - search_kwargs = {"k": vector_search_top_k, "score_threshold": score_threshold} - else: - search_kwargs = {"k": vector_search_top_k} - retriever = db.as_retriever(search_kwargs=search_kwargs, search_type=search_method) - if run_rerank: - reranker.top_n = vector_rerank_top_n - retriever = ContextualCompressionRetriever(base_compressor=reranker, base_retriever=retriever) - prompt = PromptTemplate.from_template(rag_prompt_template) - combine_docs_chain = create_stuff_documents_chain(llm, prompt) - - rag_chain = create_retrieval_chain(retriever, combine_docs_chain) - - return "Vector database is Ready" - - - def update_retriever(vector_search_top_k, vector_rerank_top_n, run_rerank, search_method, score_threshold): - """ - Update retriever - - Params: - vector_search_top_k: Vector search top k - vector_rerank_top_n: Search rerank top n - run_rerank: whether run reranker - search_method: top k search method - score_threshold: score threshold when selecting 'similarity_score_threshold' method - - """ - global db - global retriever - global combine_docs_chain - global rag_chain - - if vector_rerank_top_n > vector_search_top_k: - gr.Warning("Search top k must >= Rerank top n") - - if search_method == "similarity_score_threshold": - search_kwargs = {"k": vector_search_top_k, "score_threshold": score_threshold} - else: - search_kwargs = {"k": vector_search_top_k} - retriever = db.as_retriever(search_kwargs=search_kwargs, search_type=search_method) - if run_rerank: - retriever = ContextualCompressionRetriever(base_compressor=reranker, base_retriever=retriever) - reranker.top_n = vector_rerank_top_n - rag_chain = 
create_retrieval_chain(retriever, combine_docs_chain) - - return "Vector database is Ready" - - - def bot(history, temperature, top_p, top_k, repetition_penalty, hide_full_prompt, do_rag): - """ - callback function for running chatbot on submit button click - - Params: - history: conversation history - temperature: parameter for control the level of creativity in AI-generated text. - By adjusting the `temperature`, you can influence the AI model's probability distribution, making the text more focused or diverse. - top_p: parameter for control the range of tokens considered by the AI model based on their cumulative probability. - top_k: parameter for control the range of tokens considered by the AI model based on their cumulative probability, selecting number of tokens with highest probability. - repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text. - hide_full_prompt: whether to show searching results in promopt. - do_rag: whether do RAG when generating texts. - - """ - streamer = TextIteratorStreamer( - llm.pipeline.tokenizer, - timeout=3600.0, - skip_prompt=hide_full_prompt, - skip_special_tokens=True, - ) - pipeline_kwargs = dict( - max_new_tokens=512, - temperature=temperature, - do_sample=temperature > 0.0, - top_p=top_p, - top_k=top_k, - repetition_penalty=repetition_penalty, - streamer=streamer, - ) - if stop_tokens is not None: - pipeline_kwargs["stopping_criteria"] = StoppingCriteriaList(stop_tokens) - - llm.pipeline_kwargs = pipeline_kwargs - if do_rag: - t1 = Thread(target=rag_chain.invoke, args=({"input": history[-1][0]},)) - else: - input_text = rag_prompt_template.format(input=history[-1][0], context="") - t1 = Thread(target=llm.invoke, args=(input_text,)) - t1.start() - - # Initialize an empty string to store the generated text - partial_text = "" - for new_text in streamer: - partial_text = text_processor(partial_text, new_text) - history[-1][1] = partial_text - yield history - - - def request_cancel(): - llm.pipeline.model.request.cancel() - - - # initialize the vector store with example document - create_vectordb( - [text_example_path], - "RecursiveCharacter", - chunk_size=400, - chunk_overlap=50, - vector_search_top_k=10, - vector_rerank_top_n=2, - run_rerank=True, - search_method="similarity_score_threshold", - score_threshold=0.5, - ) - - - - -.. parsed-literal:: - - 'Vector database is Ready' - - - -Next we can create a Gradio UI and run demo. - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-rag-langchain/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo( - load_doc_fn=create_vectordb, - run_fn=bot, - stop_fn=request_cancel, - update_retriever_fn=update_retriever, - model_name=llm_model_id.value, - language=model_language.value, - ) - - try: - demo.queue().launch() - except Exception: - demo.queue().launch(share=True) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ - -.. 
code:: ipython3 - - # please uncomment and run this cell for stopping gradio interface - # demo.close() diff --git a/docs/notebooks/llm-rag-llamaindex-with-output.rst b/docs/notebooks/llm-rag-llamaindex-with-output.rst deleted file mode 100644 index a1d8fcace53a1c..00000000000000 --- a/docs/notebooks/llm-rag-llamaindex-with-output.rst +++ /dev/null @@ -1,1386 +0,0 @@ -Create a RAG system using OpenVINO and LlamaIndex -================================================= - -**Retrieval-augmented generation (RAG)** is a technique for augmenting -LLM knowledge with additional, often private or real-time, data. LLMs -can reason about wide-ranging topics, but their knowledge is limited to -the public data up to a specific point in time that they were trained -on. If you want to build AI applications that can reason about private -data or data introduced after a model’s cutoff date, you need to augment -the knowledge of the model with the specific information it needs. The -process of bringing the appropriate information and inserting it into -the model prompt is known as Retrieval Augmented Generation (RAG). - -`LlamaIndex `__ is a framework -for building context-augmented generative AI applications with -LLMs.LlamaIndex imposes no restriction on how you use LLMs. You can use -LLMs as auto-complete, chatbots, semi-autonomous agents, and more. It -just makes using them easier. - -The tutorial consists of the following steps: - -- Install prerequisites -- Download and convert the model from a public source using the - `OpenVINO integration with Hugging Face - Optimum `__. -- Compress model weights to 4-bit or 8-bit data types using - `NNCF `__ -- Create a RAG chain pipeline -- Run Q&A pipeline - -In this example, the customized RAG pipeline consists of following -components in order, where embedding, rerank and LLM will be deployed -with OpenVINO to optimize their inference performance. - -.. 
figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/91237924/0076f6c7-75e4-4c2e-9015-87b355e5ca28 - :alt: RAG - - RAG - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Select model for inference <#select-model-for-inference>`__ -- `login to huggingfacehub to get access to pretrained - model <#login-to-huggingfacehub-to-get-access-to-pretrained-model>`__ -- `Convert model and compress model - weights <#convert-model-and-compress-model-weights>`__ - - - `LLM conversion and Weights Compression using - Optimum-CLI <#llm-conversion-and-weights-compression-using-optimum-cli>`__ - - - `Weight compression with AWQ <#weight-compression-with-awq>`__ - - - `Convert embedding model using - Optimum-CLI <#convert-embedding-model-using-optimum-cli>`__ - - `Convert rerank model using - Optimum-CLI <#convert-rerank-model-using-optimum-cli>`__ - -- `Select device for inference and model - variant <#select-device-for-inference-and-model-variant>`__ - - - `Select device for embedding model - inference <#select-device-for-embedding-model-inference>`__ - - `Select device for rerank model - inference <#select-device-for-rerank-model-inference>`__ - - `Select device for LLM model - inference <#select-device-for-llm-model-inference>`__ - -- `Load model <#load-model>`__ - - - `Load embedding model <#load-embedding-model>`__ - - `Load rerank model <#load-rerank-model>`__ - - `Load LLM model <#load-llm-model>`__ - -- `Run QA over Document <#run-qa-over-document>`__ -- `Gradio Demo <#gradio-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install required dependencies - -.. 
code:: ipython3 - - import os - import requests - from pathlib import Path - - os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - with open("notebook_utils.py", "w") as f: - f.write(r.text) - - if not Path("pip_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py", - ) - open("pip_helper.py", "w").write(r.text) - - from pip_helper import pip_install - - pip_install( - "-q", - "--extra-index-url", - "https://download.pytorch.org/whl/cpu", - "llama-index", - "faiss-cpu", - "pymupdf", - "langchain", - "llama-index-readers-file", - "llama-index-vector-stores-faiss", - "llama-index-llms-langchain", - "llama-index-llms-huggingface>=0.4.0,<0.5.0", - "llama-index-embeddings-huggingface>=0.4.0,<0.5.0", - "huggingface-hub>=0.26.5", - ) - pip_install("-q", "git+https://github.com/huggingface/optimum-intel.git", "git+https://github.com/openvinotoolkit/nncf.git", "datasets", "accelerate", "gradio") - pip_install("--pre", "-U", "openvino>=2024.2", "--extra-index-url", "https://storage.openvinotoolkit.org/simple/wheels/nightly") - pip_install("--pre", "-U", "openvino-tokenizers[transformers]>=2024.2", "--extra-index-url", "https://storage.openvinotoolkit.org/simple/wheels/nightly") - pip_install("-q", "--no-deps", "llama-index-llms-openvino>=0.3.1", "llama-index-embeddings-openvino>=0.2.1", "llama-index-postprocessor-openvino-rerank>=0.2.0") - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("llm-rag-llamaindex.ipynb") - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - -.. 
code:: ipython3 - - import os - from pathlib import Path - import requests - import shutil - import io - - # fetch model configuration - - config_shared_path = Path("../../utils/llm_config.py") - config_dst_path = Path("llm_config.py") - text_example_en_path = Path("text_example_en.pdf") - text_example_cn_path = Path("text_example_cn.pdf") - text_example_en = "https://github.com/openvinotoolkit/openvino_notebooks/files/15039728/Platform.Brief_Intel.vPro.with.Intel.Core.Ultra_Final.pdf" - text_example_cn = "https://github.com/openvinotoolkit/openvino_notebooks/files/15039713/Platform.Brief_Intel.vPro.with.Intel.Core.Ultra_Final_CH.pdf" - - if not config_dst_path.exists(): - if config_shared_path.exists(): - try: - os.symlink(config_shared_path, config_dst_path) - except Exception: - shutil.copy(config_shared_path, config_dst_path) - else: - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py") - with open("llm_config.py", "w", encoding="utf-8") as f: - f.write(r.text) - elif not os.path.islink(config_dst_path): - print("LLM config will be updated") - if config_shared_path.exists(): - shutil.copy(config_shared_path, config_dst_path) - else: - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py") - with open("llm_config.py", "w", encoding="utf-8") as f: - f.write(r.text) - - - if not text_example_en_path.exists(): - r = requests.get(url=text_example_en) - content = io.BytesIO(r.content) - with open("text_example_en.pdf", "wb") as f: - f.write(content.read()) - - if not text_example_cn_path.exists(): - r = requests.get(url=text_example_cn) - content = io.BytesIO(r.content) - with open("text_example_cn.pdf", "wb") as f: - f.write(content.read()) - - -.. parsed-literal:: - - LLM config will be updated - - -Select model for inference --------------------------- - - - -The tutorial supports different models, you can select one from the -provided options to compare the quality of open source LLM solutions. - - **Note**: conversion of some models can require additional actions - from user side and at least 64GB RAM for conversion. - -The available embedding model options are: - -- `bge-small-en-v1.5 `__ -- `bge-small-zh-v1.5 `__ -- `bge-large-en-v1.5 `__ -- `bge-large-zh-v1.5 `__ -- `bge-m3 `__ - -BGE embedding is a general Embedding Model. The model is pre-trained -using RetroMAE and trained on large-scale pair data using contrastive -learning. - -The available rerank model options are: - -- `bge-reranker-v2-m3 `__ -- `bge-reranker-large `__ -- `bge-reranker-base `__ - -Reranker model with cross-encoder will perform full-attention over the -input pair, which is more accurate than embedding model (i.e., -bi-encoder) but more time-consuming than embedding model. Therefore, it -can be used to re-rank the top-k documents returned by embedding model. - -You can also find available LLM model options in -`llm-chatbot `__ notebook. - -.. code:: ipython3 - - from pathlib import Path - import ipywidgets as widgets - from notebook_utils import device_widget, optimize_bge_embedding - -Convert model and compress model weights ----------------------------------------- - - - -The Weights Compression algorithm is aimed at compressing the weights of -the models and can be used to optimize the model footprint and -performance of large models where the size of weights is relatively -larger than the size of activations, for example, Large Language Models -(LLM). 
Compared to INT8 compression, INT4 compression improves -performance even more, but introduces a minor drop in prediction -quality. - -.. code:: ipython3 - - from llm_config import ( - SUPPORTED_EMBEDDING_MODELS, - SUPPORTED_RERANK_MODELS, - SUPPORTED_LLM_MODELS, - ) - - model_languages = list(SUPPORTED_LLM_MODELS) - - model_language = widgets.Dropdown( - options=model_languages, - value=model_languages[0], - description="Model Language:", - disabled=False, - ) - - model_language - - - - -.. parsed-literal:: - - Dropdown(description='Model Language:', options=('English', 'Chinese', 'Japanese'), value='English') - - - -.. code:: ipython3 - - llm_model_ids = [model_id for model_id, model_config in SUPPORTED_LLM_MODELS[model_language.value].items() if model_config.get("rag_prompt_template")] - - llm_model_id = widgets.Dropdown( - options=llm_model_ids, - value=llm_model_ids[-1], - description="Model:", - disabled=False, - ) - - llm_model_id - - - - -.. parsed-literal:: - - Dropdown(description='Model:', index=13, options=('tiny-llama-1b-chat', 'gemma-2b-it', 'red-pajama-3b-chat', '… - - - -.. code:: ipython3 - - llm_model_configuration = SUPPORTED_LLM_MODELS[model_language.value][llm_model_id.value] - print(f"Selected LLM model {llm_model_id.value}") - - -.. parsed-literal:: - - Selected LLM model phi-3-mini-instruct - - -`Optimum Intel `__ is -the interface between the -`Transformers `__ and -`Diffusers `__ libraries -and OpenVINO to accelerate end-to-end pipelines on Intel architectures. -It provides ease-to-use cli interface for exporting models to `OpenVINO -Intermediate Representation -(IR) `__ -format. - -The command bellow demonstrates basic command for model export with -``optimum-cli`` - -:: - - optimum-cli export openvino --model --task - -where ``--model`` argument is model id from HuggingFace Hub or local -directory with model (saved using ``.save_pretrained`` method), -``--task`` is one of `supported -task `__ -that exported model should solve. For LLMs it will be -``text-generation-with-past``. If model initialization requires to use -remote code, ``--trust-remote-code`` flag additionally should be passed. - -LLM conversion and Weights Compression using Optimum-CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -You can also apply fp16, 8-bit or 4-bit weight compression on the -Linear, Convolutional and Embedding layers when exporting your model -with the CLI by setting ``--weight-format`` to respectively fp16, int8 -or int4. This type of optimization allows to reduce the memory footprint -and inference latency. By default the quantization scheme for int8/int4 -will be -`asymmetric `__, -to make it -`symmetric `__ -you can add ``--sym``. - -For INT4 quantization you can also specify the following arguments : - -- The ``--group-size`` parameter will define the group size to use for - quantization, -1 it will results in per-column quantization. -- The ``--ratio`` parameter controls the ratio between 4-bit and 8-bit - quantization. If set to 0.9, it means that 90% of the layers will be - quantized to int4 while 10% will be quantized to int8. - -Smaller group_size and ratio values usually improve accuracy at the -sacrifice of the model size and inference latency. - - **Note**: There may be no speedup for INT4/INT8 compressed models on - dGPU. - -.. 
code:: ipython3 - - from IPython.display import Markdown, display - - prepare_int4_model = widgets.Checkbox( - value=True, - description="Prepare INT4 model", - disabled=False, - ) - prepare_int8_model = widgets.Checkbox( - value=False, - description="Prepare INT8 model", - disabled=False, - ) - prepare_fp16_model = widgets.Checkbox( - value=False, - description="Prepare FP16 model", - disabled=False, - ) - - display(prepare_int4_model) - display(prepare_int8_model) - display(prepare_fp16_model) - - - -.. parsed-literal:: - - Checkbox(value=True, description='Prepare INT4 model') - - - -.. parsed-literal:: - - Checkbox(value=False, description='Prepare INT8 model') - - - -.. parsed-literal:: - - Checkbox(value=False, description='Prepare FP16 model') - - -Weight compression with AWQ -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -`Activation-aware Weight -Quantization `__ (AWQ) is an algorithm -that tunes model weights for more accurate INT4 compression. It slightly -improves generation quality of compressed LLMs, but requires significant -additional time for tuning weights on a calibration dataset. We use -``wikitext-2-raw-v1/train`` subset of the -`Wikitext `__ -dataset for calibration. - -Below you can enable AWQ to be additionally applied during model export -with INT4 precision. - - **Note**: Applying AWQ requires significant memory and time. - -.. - - **Note**: It is possible that there will be no matching patterns in - the model to apply AWQ, in such case it will be skipped. - -.. code:: ipython3 - - enable_awq = widgets.Checkbox( - value=False, - description="Enable AWQ", - disabled=not prepare_int4_model.value, - ) - display(enable_awq) - - - -.. parsed-literal:: - - Checkbox(value=False, description='Enable AWQ') - - -.. code:: ipython3 - - pt_model_id = llm_model_configuration["model_id"] - pt_model_name = llm_model_id.value.split("-")[0] - fp16_model_dir = Path(llm_model_id.value) / "FP16" - int8_model_dir = Path(llm_model_id.value) / "INT8_compressed_weights" - int4_model_dir = Path(llm_model_id.value) / "INT4_compressed_weights" - - - def convert_to_fp16(): - if (fp16_model_dir / "openvino_model.xml").exists(): - return - remote_code = llm_model_configuration.get("remote_code", False) - export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format fp16".format(pt_model_id) - if remote_code: - export_command_base += " --trust-remote-code" - export_command = export_command_base + " " + str(fp16_model_dir) - display(Markdown("**Export command:**")) - display(Markdown(f"`{export_command}`")) - ! $export_command - - - def convert_to_int8(): - if (int8_model_dir / "openvino_model.xml").exists(): - return - int8_model_dir.mkdir(parents=True, exist_ok=True) - remote_code = llm_model_configuration.get("remote_code", False) - export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int8".format(pt_model_id) - if remote_code: - export_command_base += " --trust-remote-code" - export_command = export_command_base + " " + str(int8_model_dir) - display(Markdown("**Export command:**")) - display(Markdown(f"`{export_command}`")) - ! 
$export_command - - - def convert_to_int4(): - compression_configs = { - "zephyr-7b-beta": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "mistral-7b": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "minicpm-2b-dpo": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "gemma-2b-it": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "notus-7b-v1": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "neural-chat-7b-v3-1": { - "sym": True, - "group_size": 64, - "ratio": 0.6, - }, - "llama-2-chat-7b": { - "sym": True, - "group_size": 128, - "ratio": 0.8, - }, - "llama-3-8b-instruct": { - "sym": True, - "group_size": 128, - "ratio": 0.8, - }, - "gemma-7b-it": { - "sym": True, - "group_size": 128, - "ratio": 0.8, - }, - "chatglm2-6b": { - "sym": True, - "group_size": 128, - "ratio": 0.72, - }, - "qwen-7b-chat": {"sym": True, "group_size": 128, "ratio": 0.6}, - "red-pajama-3b-chat": { - "sym": False, - "group_size": 128, - "ratio": 0.5, - }, - "qwen2.5-7b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "qwen2.5-3b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "qwen2.5-14b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "qwen2.5-1.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0}, - "default": { - "sym": False, - "group_size": 128, - "ratio": 0.8, - }, - } - - model_compression_params = compression_configs.get(llm_model_id.value, compression_configs["default"]) - if (int4_model_dir / "openvino_model.xml").exists(): - return - remote_code = llm_model_configuration.get("remote_code", False) - export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int4".format(pt_model_id) - int4_compression_args = " --group-size {} --ratio {}".format(model_compression_params["group_size"], model_compression_params["ratio"]) - if model_compression_params["sym"]: - int4_compression_args += " --sym" - if enable_awq.value: - int4_compression_args += " --awq --dataset wikitext2 --num-samples 128" - export_command_base += int4_compression_args - if remote_code: - export_command_base += " --trust-remote-code" - export_command = export_command_base + " " + str(int4_model_dir) - display(Markdown("**Export command:**")) - display(Markdown(f"`{export_command}`")) - ! $export_command - - - if prepare_fp16_model.value: - convert_to_fp16() - if prepare_int8_model.value: - convert_to_int8() - if prepare_int4_model.value: - convert_to_int4() - -Let’s compare model size for different compression types - -.. code:: ipython3 - - fp16_weights = fp16_model_dir / "openvino_model.bin" - int8_weights = int8_model_dir / "openvino_model.bin" - int4_weights = int4_model_dir / "openvino_model.bin" - - if fp16_weights.exists(): - print(f"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB") - for precision, compressed_weights in zip([8, 4], [int8_weights, int4_weights]): - if compressed_weights.exists(): - print(f"Size of model with INT{precision} compressed weights is {compressed_weights.stat().st_size / 1024 / 1024:.2f} MB") - if compressed_weights.exists() and fp16_weights.exists(): - print(f"Compression rate for INT{precision} model: {fp16_weights.stat().st_size / compressed_weights.stat().st_size:.3f}") - - -.. 
parsed-literal:: - - Size of model with INT4 compressed weights is 2319.41 MB - - -Convert embedding model using Optimum-CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Since some embedding models can only support limited languages, we can -filter them out according the LLM you selected. - -.. code:: ipython3 - - embedding_model_id = list(SUPPORTED_EMBEDDING_MODELS[model_language.value]) - - embedding_model_id = widgets.Dropdown( - options=embedding_model_id, - value=embedding_model_id[0], - description="Embedding Model:", - disabled=False, - ) - - embedding_model_id - - - - -.. parsed-literal:: - - Dropdown(description='Embedding Model:', options=('bge-small-en-v1.5', 'bge-large-en-v1.5', 'bge-m3'), value='… - - - -.. code:: ipython3 - - embedding_model_configuration = SUPPORTED_EMBEDDING_MODELS[model_language.value][embedding_model_id.value] - print(f"Selected {embedding_model_id.value} model") - - -.. parsed-literal:: - - Selected bge-small-en-v1.5 model - - -OpenVINO embedding model and tokenizer can be exported by -``feature-extraction`` task with ``optimum-cli``. - -.. code:: ipython3 - - export_command_base = "optimum-cli export openvino --model {} --task feature-extraction".format(embedding_model_configuration["model_id"]) - export_command = export_command_base + " " + str(embedding_model_id.value) - - if not Path(embedding_model_id.value).exists(): - ! $export_command - -Convert rerank model using Optimum-CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - rerank_model_id = list(SUPPORTED_RERANK_MODELS) - - rerank_model_id = widgets.Dropdown( - options=rerank_model_id, - value=rerank_model_id[0], - description="Rerank Model:", - disabled=False, - ) - - rerank_model_id - - - - -.. parsed-literal:: - - Dropdown(description='Rerank Model:', options=('bge-reranker-v2-m3', 'bge-reranker-large', 'bge-reranker-base'… - - - -.. code:: ipython3 - - rerank_model_configuration = SUPPORTED_RERANK_MODELS[rerank_model_id.value] - print(f"Selected {rerank_model_id.value} model") - - -.. parsed-literal:: - - Selected bge-reranker-v2-m3 model - - -Since ``rerank`` model is sort of sentence classification task, its -OpenVINO IR and tokenizer can be exported by ``text-classification`` -task with ``optimum-cli``. - -.. code:: ipython3 - - export_command_base = "optimum-cli export openvino --model {} --task text-classification".format(rerank_model_configuration["model_id"]) - export_command = export_command_base + " " + str(rerank_model_id.value) - - if not Path(rerank_model_id.value).exists(): - ! $export_command - -Select device for inference and model variant ---------------------------------------------- - - - - **Note**: There may be no speedup for INT4/INT8 compressed models on - dGPU. - -Select device for embedding model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - embedding_device = device_widget() - - embedding_device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -.. code:: ipython3 - - print(f"Embedding model will be loaded to {embedding_device.value} device for text embedding") - - -.. parsed-literal:: - - Embedding model will be loaded to CPU device for text embedding - - -Optimize the BGE embedding model’s parameter precision when loading -model to NPU device. - -.. 
code:: ipython3 - - USING_NPU = embedding_device.value == "NPU" - - npu_embedding_dir = embedding_model_id.value + "-npu" - npu_embedding_path = Path(npu_embedding_dir) / "openvino_model.xml" - - if USING_NPU and not Path(npu_embedding_dir).exists(): - shutil.copytree(embedding_model_id.value, npu_embedding_dir) - optimize_bge_embedding(Path(embedding_model_id.value) / "openvino_model.xml", npu_embedding_path) - -Select device for rerank model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - rerank_device = device_widget() - - rerank_device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -.. code:: ipython3 - - print(f"Rerenk model will be loaded to {rerank_device.value} device for text reranking") - - -.. parsed-literal:: - - Rerenk model will be loaded to CPU device for text reranking - - -Select device for LLM model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - llm_device = device_widget("CPU", exclude=["NPU"]) - llm_device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -.. code:: ipython3 - - print(f"LLM model will be loaded to {llm_device.value} device for response generation") - - -.. parsed-literal:: - - LLM model will be loaded to CPU device for response generation - - -Load models ------------ - - - -Load embedding model -~~~~~~~~~~~~~~~~~~~~ - - - -Now a Hugging Face embedding model can be supported by OpenVINO through -`OpenVINOEmbeddings `__ -class of LlamaIndex. - -.. code:: ipython3 - - from llama_index.embeddings.huggingface_openvino import OpenVINOEmbedding - - embedding_model_name = npu_embedding_dir if USING_NPU else embedding_model_id.value - batch_size = 1 if USING_NPU else 4 - - embedding = OpenVINOEmbedding( - model_id_or_path=embedding_model_name, embed_batch_size=batch_size, device=embedding_device.value, model_kwargs={"compile": False} - ) - if USING_NPU: - embedding._model.reshape(1, 512) - embedding._model.compile() - - embeddings = embedding.get_text_embedding("Hello World!") - print(len(embeddings)) - print(embeddings[:5]) - - -.. parsed-literal:: - - Compiling the model to CPU ... - - -.. parsed-literal:: - - 384 - [-0.003275666618719697, -0.01169075071811676, 0.04155930131673813, -0.03814813867211342, 0.02418304793536663] - - -Load rerank model -~~~~~~~~~~~~~~~~~ - - - -Now a Hugging Face embedding model can be supported by OpenVINO through -`OpenVINORerank `__ -class of LlamaIndex. - - **Note**: Rerank can be skipped in RAG. - -.. code:: ipython3 - - from llama_index.postprocessor.openvino_rerank import OpenVINORerank - - reranker = OpenVINORerank(model_id_or_path=rerank_model_id.value, device=rerank_device.value, top_n=2) - - -.. parsed-literal:: - - Compiling the model to CPU ... - - -Load LLM model -~~~~~~~~~~~~~~ - - - -OpenVINO models can be run locally through the ``HuggingFacePipeline`` -class. To deploy a model with OpenVINO, you can specify the -``backend="openvino"`` parameter to trigger OpenVINO as backend -inference framework. - -.. code:: ipython3 - - available_models = [] - if int4_model_dir.exists(): - available_models.append("INT4") - if int8_model_dir.exists(): - available_models.append("INT8") - if fp16_model_dir.exists(): - available_models.append("FP16") - - model_to_run = widgets.Dropdown( - options=available_models, - value=available_models[0], - description="Model to run:", - disabled=False, - ) - - model_to_run - - - - -.. 
parsed-literal:: - - Dropdown(description='Model to run:', options=('INT4',), value='INT4') - - - -OpenVINO models can be run locally through the ``OpenVINOLLM`` class in -`LlamaIndex `__. -If you have an Intel GPU, you can specify ``device_map="gpu"`` to run -inference on it. - -.. code:: ipython3 - - from llama_index.llms.openvino import OpenVINOLLM - - import openvino.properties as props - import openvino.properties.hint as hints - import openvino.properties.streams as streams - - - if model_to_run.value == "INT4": - model_dir = int4_model_dir - elif model_to_run.value == "INT8": - model_dir = int8_model_dir - else: - model_dir = fp16_model_dir - print(f"Loading model from {model_dir}") - - ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""} - - stop_tokens = llm_model_configuration.get("stop_tokens") - completion_to_prompt = llm_model_configuration.get("completion_to_prompt") - - if "GPU" in llm_device.value and "qwen2-7b-instruct" in llm_model_id.value: - ov_config["GPU_ENABLE_SDPA_OPTIMIZATION"] = "NO" - - # On a GPU device a model is executed in FP16 precision. For red-pajama-3b-chat model there known accuracy - # issues caused by this, which we avoid by setting precision hint to "f32". - if llm_model_id.value == "red-pajama-3b-chat" and "GPU" in core.available_devices and llm_device.value in ["GPU", "AUTO"]: - ov_config["INFERENCE_PRECISION_HINT"] = "f32" - - llm = OpenVINOLLM( - model_id_or_path=str(model_dir), - context_window=3900, - max_new_tokens=2, - model_kwargs={"ov_config": ov_config, "trust_remote_code": True}, - generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95}, - completion_to_prompt=completion_to_prompt, - device_map=llm_device.value, - ) - - response = llm.complete("2 + 2 =") - print(str(response)) - - -.. parsed-literal:: - - /home/ethan/intel/openvino_notebooks/openvino_env/lib/python3.11/site-packages/pydantic/_internal/_fields.py:161: UserWarning: Field "model_id" has conflict with protected namespace "model_". - - You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`. - warnings.warn( - - -.. parsed-literal:: - - Loading model from phi-3-mini-instruct/INT4_compressed_weights - - - -.. parsed-literal:: - - configuration_phi3.py: 0%| | 0.00/11.2k [00:00 bool: - for stop_id in self.token_ids: - if input_ids[0][-1] == stop_id: - return True - return False - - - if stop_tokens is not None: - if isinstance(stop_tokens[0], str): - stop_tokens = llm._tokenizer.convert_tokens_to_ids(stop_tokens) - stop_tokens = [StopOnTokens(stop_tokens)] - - loader = PyMuPDFReader() - documents = loader.load(file_path=text_example_path) - - # dimensions of embedding model - d = embedding._model.request.outputs[0].get_partial_shape()[2].get_length() - faiss_index = faiss.IndexFlatL2(d) - Settings.embed_model = embedding - - llm.max_new_tokens = 2048 - if stop_tokens is not None: - llm._stopping_criteria = StoppingCriteriaList(stop_tokens) - Settings.llm = llm - - vector_store = FaissVectorStore(faiss_index=faiss_index) - storage_context = StorageContext.from_defaults(vector_store=vector_store) - index = VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - transformations=[SentenceSplitter(chunk_size=200, chunk_overlap=40)], - ) - -**Retrieval and generation** - -1. ``Retrieve``: Given a user input, relevant splits are retrieved from - storage using a Retriever. -2. 
``Generate``: A LLM produces an answer using a prompt that includes - the question and the retrieved data. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/91237924/f0545ddc-c0cd-4569-8c86-9879fdab105a - :alt: Retrieval and generation pipeline - - Retrieval and generation pipeline - -.. code:: ipython3 - - query_engine = index.as_query_engine(streaming=True, similarity_top_k=10, node_postprocessors=[reranker]) - if model_language.value == "English": - query = "What can Intel vPro® Enterprise systems offer?" - else: - query = "英特尔博锐® Enterprise系统提供哪些功能?" - - streaming_response = query_engine.query(query) - streaming_response.print_response_stream() - - -.. parsed-literal:: - - /home/ethan/intel/openvino_notebooks/openvino_env/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:515: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.7` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`. - warnings.warn( - /home/ethan/intel/openvino_notebooks/openvino_env/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:520: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.95` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`. - warnings.warn( - - -.. parsed-literal:: - - - - Intel vPro® Enterprise systems can offer a range of advanced security features to protect network infrastructure. These include network security appliances, secure access service edge (SASE), next-generation firewall (NGFW), real-time deep packet inspection, antivirus, intrusion prevention and detection, and SSL/TLS inspection. These systems support more devices, users, and key capabilities such as real-time threat detection while processing higher network throughput. They also drive advanced security features for growing network infrastructure with enhanced power efficiency and density. - - Intel QuickAssist Technology (Intel QAT) accelerates and offloads key encryption/compression workloads from the CPU to free up CPU cycles. Trusted execution environments (TEEs) with Intel Software Guard Extensions (Intel SGX) and Intel Trust Domain Extensions (Intel TDX) help protect network workloads and encryption keys across edge-to-cloud infrastructure. - - In industrial and energy sectors, Intel vPro® Enterprise systems improve manageability and help reduce the operational costs of automation and control systems. Hardened platforms ensure system reliability in extreme conditions, and high core density provides more dedicated resources to VMs. - - Intel vPro® Enterprise systems also offer higher performance per watt, one-core density, and faster DDR5 memory bandwidth to enhance throughput and efficiency for edge security workloads. Intel QuickAssist Technology (Intel QAT) accelerates and offloads key encryption/compression workloads from the CPU to free up CPU cycles. Trusted execution environments (TEEs) with Intel Software Guard Extensions (Intel SGX) and Intel Trust Domain Extensions (Intel TDX) harden platforms from unauthorized access. - - Cache Allocation Technology (CAT) within the Intel® Resource Director Technology (Intel® RDT) framework enables performance prioritization for key applications to help meet real-time deterministic requirements. - - - - -Gradio Demo ------------ - - - -Now, when model created, we can setup Chatbot interface using -`Gradio `__. 
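The full demo at the end of this section is built by the ``gradio_helper.py`` script downloaded below; as a rough, illustrative sketch only (not the helper's actual implementation), the ``query_engine`` created above could be exposed through a minimal Gradio chat interface like this, assuming the ``index`` and ``reranker`` objects from the previous cells are still in scope:

.. code:: python

    import gradio as gr

    # Build a non-streaming query engine from the same index and reranker used above,
    # so each chat turn returns one complete answer string.
    plain_query_engine = index.as_query_engine(similarity_top_k=10, node_postprocessors=[reranker])


    def answer(message, history):
        # Run the RAG pipeline on the user message and return the generated text.
        return str(plain_query_engine.query(message))


    demo = gr.ChatInterface(fn=answer, title="RAG with OpenVINO and LlamaIndex")
    # demo.launch()

The actual demo additionally streams tokens and lets you rebuild the vector store from your own documents, which is why it is factored out into the helper script.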
 - -As a first step, we can check the default prompt templates used by the LlamaIndex pipeline. - -.. code:: ipython3 - - prompts_dict = query_engine.get_prompts() - - - def display_prompt_dict(prompts_dict): - for k, p in prompts_dict.items(): - text_md = f"**Prompt Key**: {k}<br>" f"**Text:** <br>" - display(Markdown(text_md)) - print(p.get_template()) - display(Markdown("<br><br>
")) - - - display_prompt_dict(prompts_dict) - - - -**Prompt Key**: response_synthesizer:text_qa_template\ **Text:** - - -.. parsed-literal:: - - Context information is below. - --------------------- - {context_str} - --------------------- - Given the context information and not prior knowledge, answer the query. - Query: {query_str} - Answer: - - - - - - - -**Prompt Key**: response_synthesizer:refine_template\ **Text:** - - -.. parsed-literal:: - - The original query is as follows: {query_str} - We have provided an existing answer: {existing_answer} - We have the opportunity to refine the existing answer (only if needed) with some more context below. - ------------ - {context_msg} - ------------ - Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer. - Refined Answer: - - - - - - -.. code:: ipython3 - - from langchain.text_splitter import RecursiveCharacterTextSplitter - from llama_index.core.node_parser import LangchainNodeParser - import gradio as gr - - TEXT_SPLITERS = { - "SentenceSplitter": SentenceSplitter, - "RecursiveCharacter": RecursiveCharacterTextSplitter, - } - - - def default_partial_text_processor(partial_text: str, new_text: str): - """ - helper for updating partially generated answer, used by default - - Params: - partial_text: text buffer for storing previosly generated text - new_text: text update for the current step - Returns: - updated text string - - """ - partial_text += new_text - return partial_text - - - text_processor = llm_model_configuration.get("partial_text_processor", default_partial_text_processor) - - - def create_vectordb(doc, spliter_name, chunk_size, chunk_overlap, vector_search_top_k, vector_rerank_top_n, run_rerank): - """ - Initialize a vector database - - Params: - doc: orignal documents provided by user - chunk_size: size of a single sentence chunk - chunk_overlap: overlap size between 2 chunks - vector_search_top_k: Vector search top k - vector_rerank_top_n: Rerrank top n - run_rerank: whether to run reranker - - """ - global query_engine - global index - - if vector_rerank_top_n > vector_search_top_k: - gr.Warning("Search top k must >= Rerank top n") - - loader = PyMuPDFReader() - documents = loader.load(file_path=doc.name) - spliter = TEXT_SPLITERS[spliter_name](chunk_size=chunk_size, chunk_overlap=chunk_overlap) - if spliter_name == "RecursiveCharacter": - spliter = LangchainNodeParser(spliter) - faiss_index = faiss.IndexFlatL2(d) - vector_store = FaissVectorStore(faiss_index=faiss_index) - storage_context = StorageContext.from_defaults(vector_store=vector_store) - - index = VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - transformations=[spliter], - ) - if run_rerank: - reranker.top_n = vector_rerank_top_n - query_engine = index.as_query_engine(streaming=True, similarity_top_k=vector_search_top_k, node_postprocessors=[reranker]) - else: - query_engine = index.as_query_engine(streaming=True, similarity_top_k=vector_search_top_k) - - return "Vector database is Ready" - - - def update_retriever(vector_search_top_k, vector_rerank_top_n, run_rerank): - """ - Update retriever - - Params: - vector_search_top_k: size of searching results - vector_rerank_top_n: size of rerank results - run_rerank: whether run rerank step - - """ - global query_engine - global index - - if vector_rerank_top_n > vector_search_top_k: - gr.Warning("Search top k must >= Rerank top n") - - if run_rerank: - reranker.top_n = vector_rerank_top_n - query_engine = 
index.as_query_engine(streaming=True, similarity_top_k=vector_search_top_k, node_postprocessors=[reranker]) - else: - query_engine = index.as_query_engine(streaming=True, similarity_top_k=vector_search_top_k) - - - def bot(history, temperature, top_p, top_k, repetition_penalty, do_rag): - """ - callback function for running chatbot on submit button click - - Params: - history: conversation history - temperature: parameter for control the level of creativity in AI-generated text. - By adjusting the `temperature`, you can influence the AI model's probability distribution, making the text more focused or diverse. - top_p: parameter for control the range of tokens considered by the AI model based on their cumulative probability. - top_k: parameter for control the range of tokens considered by the AI model based on their cumulative probability, selecting number of tokens with highest probability. - repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text. - do_rag: whether do RAG when generating texts. - - """ - llm.generate_kwargs = dict( - temperature=temperature, - do_sample=temperature > 0.0, - top_p=top_p, - top_k=top_k, - repetition_penalty=repetition_penalty, - ) - - partial_text = "" - if do_rag: - streaming_response = query_engine.query(history[-1][0]) - for new_text in streaming_response.response_gen: - partial_text = text_processor(partial_text, new_text) - history[-1][1] = partial_text - yield history - else: - streaming_response = llm.stream_complete(history[-1][0]) - for new_text in streaming_response: - partial_text = text_processor(partial_text, new_text.delta) - history[-1][1] = partial_text - yield history - - - def request_cancel(): - llm._model.request.cancel() - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-rag-llamaindex/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo( - load_doc_fn=create_vectordb, - run_fn=bot, - stop_fn=request_cancel, - update_retriever_fn=update_retriever, - model_name=llm_model_id.value, - language=model_language.value, - ) - - try: - demo.queue().launch() - except Exception: - demo.queue().launch(share=True) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ - -.. code:: ipython3 - - # please uncomment and run this cell for stopping gradio interface - # demo.close() diff --git a/docs/notebooks/localai-with-output.rst b/docs/notebooks/localai-with-output.rst deleted file mode 100644 index d5bab0d6b9a512..00000000000000 --- a/docs/notebooks/localai-with-output.rst +++ /dev/null @@ -1,231 +0,0 @@ -LocalAI and OpenVINO -==================== - -`LocalAI `__ is the free, Open Source OpenAI -alternative. LocalAI act as a drop-in replacement REST API that’s -compatible with OpenAI API specifications for local inferencing. It -allows you to run LLMs, generate images, audio (and not only) locally or -on-prem with consumer grade hardware, supporting multiple model families -and architectures. Does not require GPU. It is created and maintained by -``Ettore Di Giacinto``. - -In this tutorial we show how to prepare a model config and launch an -OpenVINO LLM model with LocalAI in docker container. 
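Because LocalAI mirrors the OpenAI REST API, clients written against the official ``openai`` Python package can usually be pointed at the local server unchanged. As a rough sketch (assuming the ``openai`` package is installed and the LocalAI container prepared later in this tutorial is running on port 8080), the ``curl`` request shown further below could equivalently be issued like this:

.. code:: python

    from openai import OpenAI

    # Point the standard OpenAI client at the local LocalAI endpoint.
    # LocalAI does not validate the API key, but the client requires one to be set.
    client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

    completion = client.completions.create(
        model="TinyLlama-1.1B-Chat-v1.0-fp16-ov",  # model name from the config prepared below
        prompt="What is OpenVINO?",
    )
    print(completion.choices[0].text)
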
- - -**Table of contents:** - - -- `Prepare Docker <#prepare-docker>`__ -- `Prepare a model <#prepare-a-model>`__ -- `Run the server <#run-the-server>`__ -- `Send a client request <#send-a-client-request>`__ -- `Stop the server <#stop-the-server>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("localai.ipynb") - -Prepare Docker --------------- - -Install `Docker -Engine `__, including its -`post-installation `__ -steps, on your development system. To verify installation, test it, -using the following command. When it is ready, it will display a test -image and a message. - -.. code:: ipython3 - - !docker run hello-world - - -.. parsed-literal:: - - - Hello from Docker! - This message shows that your installation appears to be working correctly. - - To generate this message, Docker took the following steps: - 1. The Docker client contacted the Docker daemon. - 2. The Docker daemon pulled the "hello-world" image from the Docker Hub. - (amd64) - 3. The Docker daemon created a new container from that image which runs the - executable that produces the output you are currently reading. - 4. The Docker daemon streamed that output to the Docker client, which sent it - to your terminal. - - To try something more ambitious, you can run an Ubuntu container with: - $ docker run -it ubuntu bash - - Share images, automate workflows, and more with a free Docker ID: - https://hub.docker.com/ - - For more examples and ideas, visit: - https://docs.docker.com/get-started/ - - - -Prepare a model -~~~~~~~~~~~~~~~ - - - -LocalAI allows to use customized models. For more details you can read -the -`instruction `__ -where you can also find the detailed documentation. We will use one of -the OpenVINO optimized LLMs in the collection on the `collection on -🤗Hugging -Face `__. -In this example we will use -`TinyLlama-1.1B-Chat-v1.0-fp16-ov `__. -First of all we should create a model configuration file: - -.. code:: yaml - - name: TinyLlama-1.1B-Chat-v1.0-fp16-ov - backend: transformers - parameters: - model: OpenVINO/TinyLlama-1.1B-Chat-v1.0-fp16-ov - temperature: 0.2 - top_k: 40 - top_p: 0.95 - max_new_tokens: 32 - - type: OVModelForCausalLM - - template: - chat_message: | - <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}} - {{if .Content}}{{.Content}}{{end}}<|im_end|> - chat: | - {{.Input}} - <|im_start|>assistant - - completion: | - {{.Input}} - - stopwords: - - <|im_end|> - -The fields ``backend``, ``model``, ``type`` you can find in the code -example on the model page (we added the corresponding comments): - -.. 
code:: python - - from transformers import AutoTokenizer # backend - from optimum.intel.openvino import OVModelForCausalLM # type - - model_id = "OpenVINO/TinyLlama-1.1B-Chat-v1.0-fp16-ov" # parameters.model - tokenizer = AutoTokenizer.from_pretrained(model_id) - model = OVModelForCausalLM.from_pretrained(model_id) - -The name you can choose by yourself. By this name you will specify what -model to use on the client side. - -You can create a GitHub gist and modify fields: -`ov.yaml `__ - -Description of the parameters used in config YAML file can be found -`here `__. - -The most important: - -- ``name`` - model name, used to identify the model in API calls. -- ``backend`` - backend to use for computation (like llama-cpp, - diffusers, whisper, transformers). -- ``parameters.model`` - relative to the models path. -- ``temperature``, ``top_k``, ``top_p``, ``max_new_tokens`` - - parameters for the model. -- ``type`` - type of configuration, often related to the type of task - or model architecture. -- ``template`` - templates for various types of model interactions. -- ``stopwords`` - Words or phrases that halts processing. - -Run the server -~~~~~~~~~~~~~~ - - - -Everything is ready for launch. Use -``quay.io/go-skynet/local-ai:v2.23.0-ffmpeg`` image that contains all -required dependencies. For more details read `Run with container -images `__. -If you want to see the output remove the ``-d`` flag and send a client -request from a separate notebook. - -.. code:: ipython3 - - !docker run -d --rm --name="localai" -p 8080:8080 quay.io/go-skynet/local-ai:master-sycl-f16-ffmpeg https://gist.githubusercontent.com/aleksandr-mokrov/f007c8fa6036760a856ddc60f605a0b0/raw/9d24ceeb487f9c058a943113bd0290e8ae565b3e/ov.yaml - - -.. parsed-literal:: - - 471a3e7e745080f9d62b3b4791103efedec7dabb43f2d6c7ca57b1b6023aa823 - docker: Error response from daemon: failed to create task for container: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: error during container init: error running hook #1: error running hook: exit status 1, stdout: , stderr: Auto-detected mode as 'legacy' - nvidia-container-cli: requirement error: invalid expression: unknown. - - -Check whether the ``localai`` container is running normally: - -.. code:: ipython3 - - !docker ps | grep localai - -Send a client request -~~~~~~~~~~~~~~~~~~~~~ - - - -Now you can send HTTP requests using the model name -``TinyLlama-1.1B-Chat-v1.0-fp16-ov``. More details how to use `OpenAI -API `__. - -.. code:: ipython3 - - !curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{"model": "TinyLlama-1.1B-Chat-v1.0-fp16-ov", "prompt": "What is OpenVINO?"}' - - -.. parsed-literal:: - - curl: (7) Failed to connect to localhost port 8080: Connection refused - - -Stop the server -~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - !docker stop localai - - -.. parsed-literal:: - - Error response from daemon: No such container: localai - diff --git a/docs/notebooks/ltx-video-with-output.rst b/docs/notebooks/ltx-video-with-output.rst deleted file mode 100644 index 54d599aa3a825f..00000000000000 --- a/docs/notebooks/ltx-video-with-output.rst +++ /dev/null @@ -1,502 +0,0 @@ -LTX Video and OpenVINO™ -======================= - -`LTX-Video `__ is a -transformer-based latent diffusion model that adopts a holistic approach -to video generation by seamlessly integrating the responsibilities of -the Video-VAE and the denoising transformer. 
Unlike existing methods, -which treat these components as independent, LTX-Video aims to optimize -their interaction for improved efficiency and quality. At its core is a -carefully designed Video-VAE that achieves a high compression ratio of -1:192, with spatiotemporal downscaling of 32×32×8 pixels per token, -enabled by relocating the patchifying operation from the transformer’s -input to the VAE’s input. Operating in this highly compressed latent -space enables the transformer to efficiently perform full spatiotemporal -selfattention, which is essential for generating high-resolution videos -with temporal consistency. However, the high compression inherently -limits the representation of fine details. To address this, this VAE -decoder is tasked with both latent-to-pixel conversion and the final -denoising step, producing the clean result directly in pixel space. This -approach preserves the ability to generate fine details without -incurring the runtime cost of a separate upsampling module. The model -supports diverse use cases, including text-to-video and image-to-video -generation, with both capabilities trained simultaneously. - -In this example we show how to convert text-to-video pipeline in -OpenVINO format and run inference. For reducing memory consumption, -weights compression optimization can be applied using -`NNCF `__. - -+-----------------+-----------------+-----------------+-----------------+ -| |example1|\ | |example2|\ | |example3|\ | |example4|\ | -+-----------------+-----------------+-----------------+-----------------+ -| |example5|\ | |example6|\ | |example7|\ | |example8|\ | -+-----------------+-----------------+-----------------+-----------------+ -| |example9|\ | |example10|\ | |example11|\ | |example12|\ | -+-----------------+-----------------+-----------------+-----------------+ -| |example13|\ | |example14|\ | |example15|\ | |example16|\ | -+-----------------+-----------------+-----------------+-----------------+ - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Load and run the original model <#load-the-original-model>`__ -- `Convert the model to OpenVINO - IR <#convert-the-model-to-openvino-ir>`__ -- `Compiling the model <#compiling-the-model>`__ -- `Run OpenVINO model inference <#run-openvino-model-inference>`__ -- `Interactive inference <#interactive-inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. |example1| image:: https://media.githubusercontent.com/media/Lightricks/LTX-Video/refs/heads/main/docs/_static/ltx-video_example_00001.gif -.. |example2| image:: https://media.githubusercontent.com/media/Lightricks/LTX-Video/refs/heads/main//docs/_static/ltx-video_example_00002.gif -.. |example3| image:: https://media.githubusercontent.com/media/Lightricks/LTX-Video/refs/heads/main//docs/_static/ltx-video_example_00003.gif -.. |example4| image:: https://media.githubusercontent.com/media/Lightricks/LTX-Video/refs/heads/main//docs/_static/ltx-video_example_00004.gif -.. |example5| image:: https://media.githubusercontent.com/media/Lightricks/LTX-Video/refs/heads/main//docs/_static/ltx-video_example_00005.gif -.. |example6| image:: https://media.githubusercontent.com/media/Lightricks/LTX-Video/refs/heads/main//docs/_static/ltx-video_example_00006.gif -.. 
|example7| image:: https://media.githubusercontent.com/media/Lightricks/LTX-Video/refs/heads/main//docs/_static/ltx-video_example_00007.gif -.. |example8| image:: https://media.githubusercontent.com/media/Lightricks/LTX-Video/refs/heads/main//docs/_static/ltx-video_example_00008.gif -.. |example9| image:: https://media.githubusercontent.com/media/Lightricks/LTX-Video/refs/heads/main//docs/_static/ltx-video_example_00009.gif -.. |example10| image:: https://media.githubusercontent.com/media/Lightricks/LTX-Video/refs/heads/main//docs/_static/ltx-video_example_00010.gif -.. |example11| image:: https://media.githubusercontent.com/media/Lightricks/LTX-Video/refs/heads/main//docs/_static/ltx-video_example_00011.gif -.. |example12| image:: https://media.githubusercontent.com/media/Lightricks/LTX-Video/refs/heads/main//docs/_static/ltx-video_example_00012.gif -.. |example13| image:: https://media.githubusercontent.com/media/Lightricks/LTX-Video/refs/heads/main//docs/_static/ltx-video_example_00013.gif -.. |example14| image:: https://media.githubusercontent.com/media/Lightricks/LTX-Video/refs/heads/main//docs/_static/ltx-video_example_00014.gif -.. |example15| image:: https://media.githubusercontent.com/media/Lightricks/LTX-Video/refs/heads/main//docs/_static/ltx-video_example_00015.gif -.. |example16| image:: https://media.githubusercontent.com/media/Lightricks/LTX-Video/refs/heads/main//docs/_static/ltx-video_example_00016.gif - -Prerequisites -------------- - - - -.. code:: ipython3 - - from pathlib import Path - import requests - - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - - if not Path("ov_ltx_video_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/ltx-video/ov_ltx_video_helper.py", - ) - open("ov_ltx_video_helper.py", "w").write(r.text) - - - %pip install -qU "torch>=2.1.0" "torchvision>=0.16" "diffusers>=0.28.2" "transformers>=4.44.2" "sentencepiece>=0.1.96" "huggingface-hub~=0.25.2" "einops" "accelerate" "matplotlib" "imageio[ffmpeg]" "nncf>=2.14" "gradio>=4.26" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install --pre -qU "openvino>=2024.6.0" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("ltx-video.ipynb") - -Load the original model -~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - import torch - from diffusers import LTXPipeline - from diffusers.utils import export_to_video - - pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video") - - - -.. parsed-literal:: - - Loading pipeline components...: 0%| | 0/5 [00:00`__ -should be used for these purposes. ``ov.convert_model`` function accepts -original PyTorch model instance and example input for tracing and -returns ``ov.Model`` representing this model in OpenVINO framework. -Converted model can be used for saving on disk using ``ov.save_model`` -function or directly loading on device using ``core.complie_model``. - -``ov_ltx_video_helper.py`` script contains helper function for models -downloading and models conversion, please check its content if you -interested in conversion details. 
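For reference, the helpers follow the generic conversion pattern described above. A minimal, self-contained sketch of that pattern, using a toy module rather than the real (and much larger) pipeline components, might look like this:

.. code:: python

    import torch
    import openvino as ov


    class ToyModule(torch.nn.Module):
        def forward(self, x):
            return torch.nn.functional.relu(x)


    # Trace the PyTorch module with an example input to obtain an ov.Model,
    # then serialize it to OpenVINO IR (an .xml file plus a .bin with weights).
    ov_model = ov.convert_model(ToyModule(), example_input=torch.zeros(1, 3))
    ov.save_model(ov_model, "toy_module_ir.xml")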
Note that we delete the original -models after conversion from pipeline to free memory. - -LTX Video text-to-video pipeline consists of 3 models: ``Text Encoder`` -converts input text into embeddings, ``Transformer`` processes these -embeddings to generate latents from noise step by step, ``VAEDecoder`` -performs the last denoising step in conjunction with converting latents -to pixels. - -For reducing memory consumption, weights compression optimization can be -applied using `NNCF `__. Weight -compression aims to reduce the memory footprint of a model. models, -which require extensive memory to store the weights during inference, -can benefit from weight compression in the following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method. The main -difference between weights compression and full model quantization -(post-training quantization) is that activations remain floating-point -in the case of weights compression which leads to a better accuracy. In -addition, weight compression is data-free and does not require a -calibration dataset, making it easy to use. - -``nncf.compress_weights`` function can be used for performing weights -compression. The function accepts an OpenVINO model and other -compression parameters. - -More details about weights compression can be found in `OpenVINO -documentation `__. - -.. code:: ipython3 - - import ipywidgets as widgets - - - to_compress_weights = widgets.Checkbox( - value=True, - description="Apply Weight Compression", - disabled=False, - ) - - to_compress_weights - -.. code:: ipython3 - - from ov_ltx_video_helper import convert_text_encoder, convert_transformer, convert_vae_decoder - - - # Uncomment line below to see model conversion code (replace to convert_transformer and convert_vae_decoder to see them too) - # convert_text_encoder?? - -.. code:: ipython3 - - import gc - - - TEXT_ENCODER_PATH = Path("models/text_encoder_ir.xml") - TRANSFORMER_OV_PATH = Path("models/transformer_ir.xml") - VAE_DECODER_PATH = Path("models/vae_ir.xml") - - text_encoder_dtype = pipe.text_encoder.dtype - transformer_config = pipe.transformer.config - vae_config = pipe.vae.config - vae_latents_mean = pipe.vae.latents_mean - vae_latents_std = pipe.vae.latents_std - - convert_text_encoder(pipe.text_encoder, TEXT_ENCODER_PATH, to_compress_weights.value) - del pipe.text_encoder - convert_transformer(pipe.transformer, TRANSFORMER_OV_PATH, to_compress_weights.value) - del pipe.transformer - convert_vae_decoder(pipe.vae, VAE_DECODER_PATH, to_compress_weights.value) - del pipe.vae - gc.collect() - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, openvino - ⌛ text_encoder conversion started - - -.. parsed-literal:: - - `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. - /usr/lib/python3.10/importlib/util.py:247: DeprecationWarning: The `openvino.runtime` module is deprecated and will be removed in the 2026.0 release. Please replace `openvino.runtime` with `openvino`. - self.__spec__.loader.exec_module(self) - - -.. 
parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ int8_asym │ 100% (170 / 170) │ 100% (170 / 170) │ - ┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - - - -.. parsed-literal:: - - Output() - - - - - - - - - -.. parsed-literal:: - - ✅ text_encoder model conversion finished - ⌛ transformer conversion started - - -.. parsed-literal:: - - /home/maleksandr/test_notebooks/ltx-video/openvino_notebooks/notebooks/venv-ltx/lib/python3.10/site-packages/diffusers/models/attention_processor.py:711: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if current_length != target_length: - /home/maleksandr/test_notebooks/ltx-video/openvino_notebooks/notebooks/venv-ltx/lib/python3.10/site-packages/diffusers/models/attention_processor.py:726: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if attention_mask.shape[0] < batch_size * head_size: - - -.. parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ int8_asym │ 100% (287 / 287) │ 100% (287 / 287) │ - ┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - - - -.. parsed-literal:: - - Output() - - - - - - - - - -.. parsed-literal:: - - ✅ transformer model conversion finished - ⌛ vae conversion started - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ int8_asym │ 100% (45 / 45) │ 100% (45 / 45) │ - ┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - - - -.. parsed-literal:: - - Output() - - - - - - - - - -.. parsed-literal:: - - ✅ vae model conversion finished - - -Compiling the model -~~~~~~~~~~~~~~~~~~~ - - - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - import openvino as ov - - from notebook_utils import device_widget - - - core = ov.Core() - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -``ov_catvton_helper.py`` provides wrapper classes that wrap the compiled -models to keep the original interface. 
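The exact wrapper implementations live in the helper script imported below; the fragment here is only a rough, assumed sketch of the general idea (a callable object that holds the compiled model and hands the results back as ``torch.Tensor``), not the helper's actual code:

.. code:: python

    import torch


    class CompiledModelWrapper:
        """Illustrative wrapper keeping a diffusers-style callable interface."""

        def __init__(self, compiled_model):
            self.compiled_model = compiled_model  # an openvino.CompiledModel

        def __call__(self, *args):
            # Run the compiled model and convert NumPy outputs back to torch
            # tensors so downstream pipeline code does not need to change.
            ov_outputs = self.compiled_model(list(args))
            return [torch.from_numpy(out) for out in ov_outputs.values()]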
Note that all of wrapper classes -return ``torch.Tensor``\ s instead of ``np.array``\ s. Then we insert -wrappers instances in the pipeline. - -.. code:: ipython3 - - from ov_ltx_video_helper import TextEncoderWrapper, ConvTransformerWrapper, VAEWrapper - - - # Uncomment the line below to see the wrapper class code (replace to ConvTransformerWrapper and VAEWrapper to see them too) - # TextEncoderWrapper?? - -.. code:: ipython3 - - compiled_transformer = core.compile_model(TRANSFORMER_OV_PATH, device.value) - compiled_vae = core.compile_model(VAE_DECODER_PATH, device.value) - compiled_text_encoder = core.compile_model(TEXT_ENCODER_PATH, device.value) - - pipe.__dict__["_internal_dict"]["_execution_device"] = pipe._execution_device # this is to avoid some problem that can occur in the pipeline - - pipe.register_modules( - text_encoder=TextEncoderWrapper(compiled_text_encoder, text_encoder_dtype), - transformer=ConvTransformerWrapper(compiled_transformer, transformer_config), - vae=VAEWrapper(compiled_vae, vae_config, vae_latents_mean, vae_latents_std), - ) - -Run OpenVINO model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -`General -tips `__: - -- The model works on resolutions that are divisible by 32 and number of - frames that are divisible by 8 + 1 (e.g. 257). In case the input is not - satisfied to the described conditions, the input will be padded with -1 - and then cropped to the desired resolution and number of frames. -- The model works best on resolutions under 720 x 1280 and number of frames - below 257. -- Prompts should be in English. The more elaborate the - better. Good prompt looks like The turquoise waves crash against the - dark, jagged rocks of the shore, sending white foam spraying into the - air. The scene is dominated by the stark contrast between the bright - blue water and the dark, almost black rocks. The water is a clear, - turquoise color, and the waves are capped with white foam. The rocks are - dark and jagged, and they are covered in patches of green moss. The - shore is lined with lush green vegetation, including trees and bushes. - In the background, there are rolling hills covered in dense forest. The - sky is cloudy, and the light is dim. - -.. code:: ipython3 - - prompt = "A clear, turquoise river flows through a rocky canyon, cascading over a small waterfall and forming a pool of water at the bottom.The river is the main focus of the scene, with its clear water reflecting the surrounding trees and rocks. The canyon walls are steep and rocky, with some vegetation growing on them. The trees are mostly pine trees, with their green needles contrasting with the brown and gray rocks. The overall tone of the scene is one of peace and tranquility." - negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted" - generator = torch.Generator(device="cpu").manual_seed(42) - - - video = pipe( - prompt=prompt, - negative_prompt=negative_prompt, - width=704, - height=480, - num_frames=24, - num_inference_steps=40, - generator=generator, - ).frames[0] - export_to_video(video, "output1_ov.mp4", fps=24) - - -.. parsed-literal:: - - /home/maleksandr/test_notebooks/ltx-video/openvino_notebooks/notebooks/venv-ltx/lib/python3.10/site-packages/diffusers/configuration_utils.py:140: FutureWarning: Accessing config attribute `_execution_device` directly via 'LTXPipeline' object attribute is deprecated. Please access '_execution_device' over 'LTXPipeline's config object instead, e.g. 'scheduler.config._execution_device'. 
- deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) - - - -.. parsed-literal:: - - 0%| | 0/40 [00:00`__ and `Google’s -blog -post `__. - -In this tutorial we consider how to bring OpenVINO power into Magika. - - -**Table of contents:** - -- `Prerequisites <#prerequisites>`__ -- `Define model loading class <#define-model-loading-class>`__ -- `Run OpenVINO model inference <#run-openvino-model-inference>`__ - - - `Select device <#select-device>`__ - - `Create model <#create-model>`__ - - `Run inference on bytes input <#run-inference-on-bytes-input>`__ - - `Run inference on file input <#run-inference-on-file-input>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q magika "openvino>=2024.1.0" "gradio>=4.19" - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -Define model loading class --------------------------- - - - -At inference time Magika uses ONNX as an inference engine to ensure -files are identified in a matter of milliseconds, almost as fast as a -non-AI tool even on CPU. The code below extending original Magika -inference class with OpenVINO API. The provided code is fully compatible -with original `Magika Python -API `__. - -.. code:: ipython3 - - import time - from pathlib import Path - from functools import partial - from typing import List, Tuple, Optional, Dict - - from magika import Magika - from magika.types import ModelFeatures, ModelOutput, MagikaResult - from magika.prediction_mode import PredictionMode - import numpy.typing as npt - import numpy as np - - import openvino as ov - - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("magika-content-type-recognition.ipynb") - - - class OVMagika(Magika): - def __init__( - self, - model_dir: Optional[Path] = None, - prediction_mode: PredictionMode = PredictionMode.HIGH_CONFIDENCE, - no_dereference: bool = False, - verbose: bool = False, - debug: bool = False, - use_colors: bool = False, - device="CPU", - ) -> None: - self._device = device - super().__init__(model_dir, prediction_mode, no_dereference, verbose, debug, use_colors) - - def _init_onnx_session(self): - # overload model loading using OpenVINO - start_time = time.time() - core = ov.Core() - ov_model = core.compile_model(self._model_path, self._device.upper()) - elapsed_time = 1000 * (time.time() - start_time) - self._log.debug(f'ONNX DL model "{self._model_path}" loaded in {elapsed_time:.03f} ms on {self._device}') - return ov_model - - def _get_raw_predictions(self, features: List[Tuple[Path, ModelFeatures]]) -> npt.NDArray: - """ - Given a list of (path, features), return a (files_num, features_size) - matrix encoding the predictions. 
- """ - - dataset_format = self._model_config["train_dataset_info"]["dataset_format"] - assert dataset_format == "int-concat/one-hot" - start_time = time.time() - X_bytes = [] - for _, fs in features: - sample_bytes = [] - if self._input_sizes["beg"] > 0: - sample_bytes.extend(fs.beg[: self._input_sizes["beg"]]) - if self._input_sizes["mid"] > 0: - sample_bytes.extend(fs.mid[: self._input_sizes["mid"]]) - if self._input_sizes["end"] > 0: - sample_bytes.extend(fs.end[-self._input_sizes["end"] :]) - X_bytes.append(sample_bytes) - X = np.array(X_bytes).astype(np.float32) - elapsed_time = time.time() - start_time - self._log.debug(f"DL input prepared in {elapsed_time:.03f} seconds") - - start_time = time.time() - raw_predictions_list = [] - samples_num = X.shape[0] - - max_internal_batch_size = 1000 - batches_num = samples_num // max_internal_batch_size - if samples_num % max_internal_batch_size != 0: - batches_num += 1 - - for batch_idx in range(batches_num): - self._log.debug(f"Getting raw predictions for (internal) batch {batch_idx+1}/{batches_num}") - start_idx = batch_idx * max_internal_batch_size - end_idx = min((batch_idx + 1) * max_internal_batch_size, samples_num) - batch_raw_predictions = self._onnx_session({"bytes": X[start_idx:end_idx, :]})["target_label"] - raw_predictions_list.append(batch_raw_predictions) - elapsed_time = time.time() - start_time - self._log.debug(f"DL raw prediction in {elapsed_time:.03f} seconds") - return np.concatenate(raw_predictions_list) - - def _get_topk_model_outputs_from_features(self, all_features: List[Tuple[Path, ModelFeatures]], k: int = 5) -> List[Tuple[Path, List[ModelOutput]]]: - """ - Helper function for getting top k the highest ranked model results for each feature - """ - raw_preds = self._get_raw_predictions(all_features) - top_preds_idxs = np.argsort(raw_preds, axis=1)[:, -k:][:, ::-1] - scores = [raw_preds[i, idx] for i, idx in enumerate(top_preds_idxs)] - results = [] - for (path, _), scores, top_idxes in zip(all_features, raw_preds, top_preds_idxs): - model_outputs_for_path = [] - for idx in top_idxes: - ct_label = self._target_labels_space_np[idx] - score = scores[idx] - model_outputs_for_path.append(ModelOutput(ct_label=ct_label, score=float(score))) - results.append((path, model_outputs_for_path)) - return results - - def _get_results_from_features_topk(self, all_features: List[Tuple[Path, ModelFeatures]], top_k=5) -> Dict[str, MagikaResult]: - """ - Helper function for getting top k the highest ranked model results for each feature - """ - # We now do inference for those files that need it. - - if len(all_features) == 0: - # nothing to be done - return {} - - outputs: Dict[str, MagikaResult] = {} - - for path, model_output in self._get_topk_model_outputs_from_features(all_features, top_k): - # In additional to the content type label from the DL model, we - # also allow for other logic to overwrite such result. For - # debugging and information purposes, the JSON output stores - # both the raw DL model output and the final output we return to - # the user. 
- results = [] - for out in model_output: - output_ct_label = self._get_output_ct_label_from_dl_result(out.ct_label, out.score) - - results.append( - self._get_result_from_labels_and_score( - path, - dl_ct_label=out.ct_label, - output_ct_label=output_ct_label, - score=out.score, - ) - ) - outputs[str(path)] = results - - return outputs - - def identify_bytes_topk(self, content: bytes, top_k=5) -> MagikaResult: - # Helper function for getting topk results from bytes - _get_results_from_features = self._get_results_from_features - self._get_results_from_features = partial(self._get_results_from_features_topk, top_k=top_k) - result = super().identify_bytes(content) - self._get_results_from_features = _get_results_from_features - return result - -Run OpenVINO model inference ----------------------------- - - - -Now let’s check model inference result. - -Select device -~~~~~~~~~~~~~ - - - -For starting work, please, select one of represented devices from -dropdown list. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Create model -~~~~~~~~~~~~ - - - -As we discussed above, our OpenVINO extended ``OVMagika`` class has the -same API like original one. Let’s try to create interface instance and -launch it on different input formats - -.. code:: ipython3 - - ov_magika = OVMagika(device=device.value) - -Run inference on bytes input -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - result = ov_magika.identify_bytes(b"# Example\nThis is an example of markdown!") - print(f"Content type: {result.output.ct_label} - {result.output.score * 100:.4}%") - - -.. parsed-literal:: - - Content type: markdown - 99.29% - - -Run inference on file input -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - import requests - - input_file = Path("./README.md") - if not input_file.exists(): - r = requests.get("https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/README.md") - with open("README.md", "w") as f: - f.write(r.text) - result = ov_magika.identify_path(input_file) - print(f"Content type: {result.output.ct_label} - {result.output.score * 100:.4}%") - - -.. parsed-literal:: - - Content type: markdown - 100.0% - - -Interactive demo ----------------- - - - -Now, you can try model on own files. Upload file into input file window, -click submit button and look on predicted file types. - -.. code:: ipython3 - - import gradio as gr - - - def classify(file_path): - """Classify file using classes listing. - Args: - file_path): path to input file - Returns: - (dict): Mapping between class labels and class probabilities. - """ - results = ov_magika.identify_bytes_topk(file_path) - - return {result.dl.ct_label: float(result.output.score) for result in results} - - - demo = gr.Interface( - fn=classify, - inputs=[ - gr.File(label="Input file", type="binary"), - ], - outputs=gr.Label(label="Result"), - examples=[["./README.md"]], - allow_flagging="never", - ) - try: - demo.launch(debug=False) - except Exception: - demo.launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - - -.. parsed-literal:: - - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. 
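Outside of the Gradio demo, the same top-k scores can be read programmatically. This is only a small usage sketch and assumes the ``ov_magika`` instance created earlier in this notebook:

.. code:: python

    # Query the top 3 candidate content types for an in-memory byte buffer.
    top_results = ov_magika.identify_bytes_topk(b'{"name": "example"}', top_k=3)
    for candidate in top_results:
        # `dl` holds the raw model label, `output` the post-processed label and score.
        print(f"{candidate.dl.ct_label}: {candidate.output.score * 100:.2f}%")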
- - - - - - - diff --git a/docs/notebooks/meter-reader-with-output.rst b/docs/notebooks/meter-reader-with-output.rst deleted file mode 100644 index 053b4b3ae9457a..00000000000000 --- a/docs/notebooks/meter-reader-with-output.rst +++ /dev/null @@ -1,832 +0,0 @@ -Industrial Meter Reader -======================= - -This notebook shows how to create a industrial meter reader with -OpenVINO Runtime. We use the pre-trained -`PPYOLOv2 `__ -PaddlePaddle model and -`DeepLabV3P `__ -to build up a multiple inference task pipeline: - -1. Run detection model to find the meters, and crop them from the origin - photo. -2. Run segmentation model on these cropped meters to get the pointer and - scale instance. -3. Find the location of the pointer in scale map. - -.. figure:: https://user-images.githubusercontent.com/91237924/166137115-67284fa5-f703-4468-98f4-c43d2c584763.png - :alt: workflow - - workflow - - -**Table of contents:** - - -- `Import <#import>`__ -- `Prepare the Model and Test - Image <#prepare-the-model-and-test-image>`__ -- `Configuration <#configuration>`__ -- `Load the Models <#load-the-models>`__ -- `Data Process <#data-process>`__ -- `Main Function <#main-function>`__ - - - `Initialize the model and - parameters. <#initialize-the-model-and-parameters->`__ - - `Run meter detection model <#run-meter-detection-model>`__ - - `Run meter segmentation model <#run-meter-segmentation-model>`__ - - `Postprocess the models result and calculate the final - readings <#postprocess-the-models-result-and-calculate-the-final-readings>`__ - - `Get the reading result on the meter - picture <#get-the-reading-result-on-the-meter-picture>`__ - -- `Try it with your meter photos! <#try-it-with-your-meter-photos>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - # Install openvino package - %pip install -q "openvino>=2023.1.0" opencv-python tqdm "matplotlib>=3.4" - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -Import ------- - - - -.. code:: ipython3 - - import os - from pathlib import Path - import numpy as np - import math - import cv2 - import tarfile - import matplotlib.pyplot as plt - import openvino as ov - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, segmentation_map_to_image, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("meter-reader.ipynb") - -Prepare the Model and Test Image --------------------------------- - -Download PPYOLOv2 and -DeepLabV3P pre-trained models from PaddlePaddle community. - -.. 
code:: ipython3 - - MODEL_DIR = "model" - DATA_DIR = "data" - DET_MODEL_LINK = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/meter-reader/meter_det_model.tar.gz" - SEG_MODEL_LINK = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/meter-reader/meter_seg_model.tar.gz" - DET_FILE_NAME = DET_MODEL_LINK.split("/")[-1] - SEG_FILE_NAME = SEG_MODEL_LINK.split("/")[-1] - IMG_LINK = "https://user-images.githubusercontent.com/91237924/170696219-f68699c6-1e82-46bf-aaed-8e2fc3fa5f7b.jpg" - IMG_FILE_NAME = IMG_LINK.split("/")[-1] - IMG_PATH = Path(f"{DATA_DIR}/{IMG_FILE_NAME}") - - os.makedirs(MODEL_DIR, exist_ok=True) - - download_file(DET_MODEL_LINK, directory=MODEL_DIR, show_progress=True) - file = tarfile.open(f"model/{DET_FILE_NAME}") - res = file.extractall("model") - if not res: - print(f'Detection Model Extracted to "./{MODEL_DIR}".') - else: - print("Error Extracting the Detection model. Please check the network.") - - download_file(SEG_MODEL_LINK, directory=MODEL_DIR, show_progress=True) - file = tarfile.open(f"model/{SEG_FILE_NAME}") - res = file.extractall("model") - if not res: - print(f'Segmentation Model Extracted to "./{MODEL_DIR}".') - else: - print("Error Extracting the Segmentation model. Please check the network.") - - download_file(IMG_LINK, directory=DATA_DIR, show_progress=True) - if IMG_PATH.is_file(): - print(f'Test Image Saved to "./{DATA_DIR}".') - else: - print("Error Downloading the Test Image. Please check the network.") - - - -.. parsed-literal:: - - meter_det_model.tar.gz: 0%| | 0.00/192M [00:00 score_threshold: - filtered_results.append(det_results[i]) - return filtered_results - - - def roi_crop(image, results, scale_x, scale_y): - """ - Crop the area of detected meter of original image - - Param: - img (np.array):original image。 - det_results (list[dict]): detection results - scale_x (float): the scale value in x axis - scale_y (float): the scale value in y axis - - Retuns: - roi_imgs (list[np.array]): the list of meter images - loc (list[int]): the list of meter locations - - """ - roi_imgs = [] - loc = [] - for result in results: - bbox = result[2:] - xmin, ymin, xmax, ymax = [ - int(bbox[0] * scale_x), - int(bbox[1] * scale_y), - int(bbox[2] * scale_x), - int(bbox[3] * scale_y), - ] - sub_img = image[ymin : (ymax + 1), xmin : (xmax + 1), :] - roi_imgs.append(sub_img) - loc.append([xmin, ymin, xmax, ymax]) - return roi_imgs, loc - - - def roi_process(input_images, target_size, interp=cv2.INTER_LINEAR): - """ - Prepare the roi image of detection results data - Preprocessing the input data for segmentation task - - Param: - input_images (list[np.array]):the list of meter images - target_size (list|tuple): height and width of resized image, e.g [heigh,width] - interp (int):the interp method for image reszing - - Retuns: - img_list (list[np.array]):the list of processed images - resize_img (list[np.array]): for visualization - - """ - img_list = list() - resize_list = list() - for img in input_images: - img_shape = img.shape - scale_x = float(target_size[1]) / float(img_shape[1]) - scale_y = float(target_size[0]) / float(img_shape[0]) - resize_img = cv2.resize(img, None, None, fx=scale_x, fy=scale_y, interpolation=interp) - resize_list.append(resize_img) - resize_img = resize_img.transpose(2, 0, 1) / 255 - img_mean = np.array([0.5, 0.5, 0.5]).reshape((3, 1, 1)) - img_std = np.array([0.5, 0.5, 0.5]).reshape((3, 1, 1)) - resize_img -= img_mean - resize_img /= img_std - img_list.append(resize_img) - return img_list, 
resize_list - - - def erode(seg_results, erode_kernel): - """ - Erode the segmentation result to get the more clear instance of pointer and scale - - Param: - seg_results (list[dict]):segmentation results - erode_kernel (int): size of erode_kernel - - Return: - eroded_results (list[dict]): the lab map of eroded_results - - """ - kernel = np.ones((erode_kernel, erode_kernel), np.uint8) - eroded_results = seg_results - for i in range(len(seg_results)): - eroded_results[i] = cv2.erode(seg_results[i].astype(np.uint8), kernel) - return eroded_results - - - def circle_to_rectangle(seg_results): - """ - Switch the shape of label_map from circle to rectangle - - Param: - seg_results (list[dict]):segmentation results - - Return: - rectangle_meters (list[np.array]):the rectangle of label map - - """ - rectangle_meters = list() - for i, seg_result in enumerate(seg_results): - label_map = seg_result - - # The size of rectangle_meter is determined by RECTANGLE_HEIGHT and RECTANGLE_WIDTH - rectangle_meter = np.zeros((RECTANGLE_HEIGHT, RECTANGLE_WIDTH), dtype=np.uint8) - for row in range(RECTANGLE_HEIGHT): - for col in range(RECTANGLE_WIDTH): - theta = PI * 2 * (col + 1) / RECTANGLE_WIDTH - - # The radius of meter circle will be mapped to the height of rectangle image - rho = CIRCLE_RADIUS - row - 1 - y = int(CIRCLE_CENTER[0] + rho * math.cos(theta) + 0.5) - x = int(CIRCLE_CENTER[1] - rho * math.sin(theta) + 0.5) - rectangle_meter[row, col] = label_map[y, x] - rectangle_meters.append(rectangle_meter) - return rectangle_meters - - - def rectangle_to_line(rectangle_meters): - """ - Switch the dimension of rectangle label map from 2D to 1D - - Param: - rectangle_meters (list[np.array]):2D rectangle OF label_map。 - - Return: - line_scales (list[np.array]): the list of scales value - line_pointers (list[np.array]):the list of pointers value - - """ - line_scales = list() - line_pointers = list() - for rectangle_meter in rectangle_meters: - height, width = rectangle_meter.shape[0:2] - line_scale = np.zeros((width), dtype=np.uint8) - line_pointer = np.zeros((width), dtype=np.uint8) - for col in range(width): - for row in range(height): - if rectangle_meter[row, col] == SEG_LABEL["pointer"]: - line_pointer[col] += 1 - elif rectangle_meter[row, col] == SEG_LABEL["scale"]: - line_scale[col] += 1 - line_scales.append(line_scale) - line_pointers.append(line_pointer) - return line_scales, line_pointers - - - def mean_binarization(data_list): - """ - Binarize the data - - Param: - data_list (list[np.array]):input data - - Return: - binaried_data_list (list[np.array]):output data。 - - """ - batch_size = len(data_list) - binaried_data_list = data_list - for i in range(batch_size): - mean_data = np.mean(data_list[i]) - width = data_list[i].shape[0] - for col in range(width): - if data_list[i][col] < mean_data: - binaried_data_list[i][col] = 0 - else: - binaried_data_list[i][col] = 1 - return binaried_data_list - - - def locate_scale(line_scales): - """ - Find location of center of each scale - - Param: - line_scales (list[np.array]):the list of binaried scales value - - Return: - scale_locations (list[list]):location of each scale - - """ - batch_size = len(line_scales) - scale_locations = list() - for i in range(batch_size): - line_scale = line_scales[i] - width = line_scale.shape[0] - find_start = False - one_scale_start = 0 - one_scale_end = 0 - locations = list() - for j in range(width - 1): - if line_scale[j] > 0 and line_scale[j + 1] > 0: - if not find_start: - one_scale_start = j - find_start = True - if 
find_start: - if line_scale[j] == 0 and line_scale[j + 1] == 0: - one_scale_end = j - 1 - one_scale_location = (one_scale_start + one_scale_end) / 2 - locations.append(one_scale_location) - one_scale_start = 0 - one_scale_end = 0 - find_start = False - scale_locations.append(locations) - return scale_locations - - - def locate_pointer(line_pointers): - """ - Find location of center of pointer - - Param: - line_scales (list[np.array]):the list of binaried pointer value - - Return: - scale_locations (list[list]):location of pointer - - """ - batch_size = len(line_pointers) - pointer_locations = list() - for i in range(batch_size): - line_pointer = line_pointers[i] - find_start = False - pointer_start = 0 - pointer_end = 0 - location = 0 - width = line_pointer.shape[0] - for j in range(width - 1): - if line_pointer[j] > 0 and line_pointer[j + 1] > 0: - if not find_start: - pointer_start = j - find_start = True - if find_start: - if line_pointer[j] == 0 and line_pointer[j + 1] == 0: - pointer_end = j - 1 - location = (pointer_start + pointer_end) / 2 - find_start = False - break - pointer_locations.append(location) - return pointer_locations - - - def get_relative_location(scale_locations, pointer_locations): - """ - Match location of pointer and scales - - Param: - scale_locations (list[list]):location of each scale - pointer_locations (list[list]):location of pointer - - Return: - pointed_scales (list[dict]): a list of dict with: - 'num_scales': total number of scales - 'pointed_scale': predicted number of scales - - """ - pointed_scales = list() - for scale_location, pointer_location in zip(scale_locations, pointer_locations): - num_scales = len(scale_location) - pointed_scale = -1 - if num_scales > 0: - for i in range(num_scales - 1): - if scale_location[i] <= pointer_location < scale_location[i + 1]: - pointed_scale = i + (pointer_location - scale_location[i]) / (scale_location[i + 1] - scale_location[i] + 1e-05) + 1 - result = {"num_scales": num_scales, "pointed_scale": pointed_scale} - pointed_scales.append(result) - return pointed_scales - - - def calculate_reading(pointed_scales): - """ - Calculate the value of meter according to the type of meter - - Param: - pointed_scales (list[list]):predicted number of scales - - Return: - readings (list[float]): the list of values read from meter - - """ - readings = list() - batch_size = len(pointed_scales) - for i in range(batch_size): - pointed_scale = pointed_scales[i] - # find the type of meter according the total number of scales - if pointed_scale["num_scales"] > TYPE_THRESHOLD: - reading = pointed_scale["pointed_scale"] * METER_CONFIG[0]["scale_interval_value"] - else: - reading = pointed_scale["pointed_scale"] * METER_CONFIG[1]["scale_interval_value"] - readings.append(reading) - return readings - -Main Function -------------- - - - -Initialize the model and parameters. -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -The number of detected meter from detection network can be arbitrary in -some scenarios, which means the batch size of segmentation network input -is a `dynamic -dimension `__, -and it should be specified as ``-1`` or the ``ov::Dimension()`` instead -of a positive number used for static dimensions. 
In this case, for -memory consumption optimization, we can specify the lower and/or upper -bounds of input batch size. - -.. code:: ipython3 - - img_file = f"{DATA_DIR}/{IMG_FILE_NAME}" - det_model_path = f"{MODEL_DIR}/meter_det_model/model.pdmodel" - det_model_shape = { - "image": [1, 3, 608, 608], - "im_shape": [1, 2], - "scale_factor": [1, 2], - } - seg_model_path = f"{MODEL_DIR}/meter_seg_model/model.pdmodel" - seg_model_shape = {"image": [ov.Dimension(1, 2), 3, 512, 512]} - - erode_kernel = 4 - score_threshold = 0.5 - seg_batch_size = 2 - input_shape = 608 - - # Intialize the model objects - detector = Model(det_model_path, det_model_shape, device.value) - segmenter = Model(seg_model_path, seg_model_shape, device.value) - - # Visulize a original input photo - image = cv2.imread(img_file) - rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - plt.imshow(rgb_image) - - - - -.. parsed-literal:: - - - - - - -.. image:: meter-reader-with-output_files/meter-reader-with-output_16_1.png - - -Run meter detection model -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Detect the location of the -meter and prepare the ROI images for segmentation. - -.. code:: ipython3 - - # Prepare the input data for meter detection model - im_shape = np.array([[input_shape, input_shape]]).astype("float32") - scale_factor = np.array([[1, 2]]).astype("float32") - input_image = det_preprocess(image, input_shape) - inputs_dict = {"image": input_image, "im_shape": im_shape, "scale_factor": scale_factor} - - # Run meter detection model - det_results = detector.predict(inputs_dict) - - # Filter out the bounding box with low confidence - filtered_results = filter_bboxes(det_results, score_threshold) - - # Prepare the input data for meter segmentation model - scale_x = image.shape[1] / input_shape * 2 - scale_y = image.shape[0] / input_shape - - # Create the individual picture for each detected meter - roi_imgs, loc = roi_crop(image, filtered_results, scale_x, scale_y) - roi_imgs, resize_imgs = roi_process(roi_imgs, METER_SHAPE) - - # Create the pictures of detection results - roi_stack = np.hstack(resize_imgs) - - if cv2.imwrite(f"{DATA_DIR}/detection_results.jpg", roi_stack): - print('The detection result image has been saved as "detection_results.jpg" in data') - plt.imshow(cv2.cvtColor(roi_stack, cv2.COLOR_BGR2RGB)) - - -.. parsed-literal:: - - The detection result image has been saved as "detection_results.jpg" in data - - - -.. image:: meter-reader-with-output_files/meter-reader-with-output_18_1.png - - -Run meter segmentation model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Get the results of segmentation -task on detected ROI. - -.. code:: ipython3 - - seg_results = list() - mask_list = list() - num_imgs = len(roi_imgs) - - # Run meter segmentation model on all detected meters - for i in range(0, num_imgs, seg_batch_size): - batch = roi_imgs[i : min(num_imgs, i + seg_batch_size)] - seg_result = segmenter.predict({"image": np.array(batch)}) - seg_results.extend(seg_result) - results = [] - for i in range(len(seg_results)): - results.append(np.argmax(seg_results[i], axis=0)) - seg_results = erode(results, erode_kernel) - - # Create the pictures of segmentation results - for i in range(len(seg_results)): - mask_list.append(segmentation_map_to_image(seg_results[i], COLORMAP)) - mask_stack = np.hstack(mask_list) - - if cv2.imwrite(f"{DATA_DIR}/segmentation_results.jpg", cv2.cvtColor(mask_stack, cv2.COLOR_RGB2BGR)): - print('The segmentation result image has been saved as "segmentation_results.jpg" in data') - plt.imshow(mask_stack) - - -.. 
parsed-literal:: - - The segmentation result image has been saved as "segmentation_results.jpg" in data - - - -.. image:: meter-reader-with-output_files/meter-reader-with-output_20_1.png - - -Postprocess the models result and calculate the final readings -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Use OpenCV function to find the -location of the pointer in a scale map. - -.. code:: ipython3 - - # Find the pointer location in scale map and calculate the meters reading - rectangle_meters = circle_to_rectangle(seg_results) - line_scales, line_pointers = rectangle_to_line(rectangle_meters) - binaried_scales = mean_binarization(line_scales) - binaried_pointers = mean_binarization(line_pointers) - scale_locations = locate_scale(binaried_scales) - pointer_locations = locate_pointer(binaried_pointers) - pointed_scales = get_relative_location(scale_locations, pointer_locations) - meter_readings = calculate_reading(pointed_scales) - - rectangle_list = list() - # Plot the rectangle meters - for i in range(len(rectangle_meters)): - rectangle_list.append(segmentation_map_to_image(rectangle_meters[i], COLORMAP)) - rectangle_meters_stack = np.hstack(rectangle_list) - - if cv2.imwrite( - f"{DATA_DIR}/rectangle_meters.jpg", - cv2.cvtColor(rectangle_meters_stack, cv2.COLOR_RGB2BGR), - ): - print('The rectangle_meters result image has been saved as "rectangle_meters.jpg" in data') - plt.imshow(rectangle_meters_stack) - - -.. parsed-literal:: - - The rectangle_meters result image has been saved as "rectangle_meters.jpg" in data - - - -.. image:: meter-reader-with-output_files/meter-reader-with-output_22_1.png - - -Get the reading result on the meter picture -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Create a final result photo with reading - for i in range(len(meter_readings)): - print("Meter {}: {:.3f}".format(i + 1, meter_readings[i])) - - result_image = image.copy() - for i in range(len(loc)): - cv2.rectangle(result_image, (loc[i][0], loc[i][1]), (loc[i][2], loc[i][3]), (0, 150, 0), 3) - font = cv2.FONT_HERSHEY_SIMPLEX - cv2.rectangle( - result_image, - (loc[i][0], loc[i][1]), - (loc[i][0] + 100, loc[i][1] + 40), - (0, 150, 0), - -1, - ) - cv2.putText( - result_image, - "#{:.3f}".format(meter_readings[i]), - (loc[i][0], loc[i][1] + 25), - font, - 0.8, - (255, 255, 255), - 2, - cv2.LINE_AA, - ) - if cv2.imwrite(f"{DATA_DIR}/reading_results.jpg", result_image): - print('The reading results image has been saved as "reading_results.jpg" in data') - plt.imshow(cv2.cvtColor(result_image, cv2.COLOR_BGR2RGB)) - - -.. parsed-literal:: - - Meter 1: 1.100 - Meter 2: 6.185 - The reading results image has been saved as "reading_results.jpg" in data - - - -.. image:: meter-reader-with-output_files/meter-reader-with-output_24_1.png - - -Try it with your meter photos! 
------------------------------- - - diff --git a/docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_16_1.png b/docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_16_1.png deleted file mode 100644 index 52a1b757cb6589..00000000000000 --- a/docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_16_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:08c5ae3bb47e095d707bdaa7f8008bed7eeb1f672c82ae4d63334e665ec3e4d8 -size 170121 diff --git a/docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_18_1.png b/docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_18_1.png deleted file mode 100644 index 7151cac5e2d0e8..00000000000000 --- a/docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_18_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6433ef738eeb00f8d0dc4343ab289073c76321d2e12fe46318fbe374b0f745e2 -size 190271 diff --git a/docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_20_1.png b/docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_20_1.png deleted file mode 100644 index 05c23937df9fe5..00000000000000 --- a/docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_20_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3d67df91f05c9aeb0442a1c4aaef7527cf27e9be0938642eed807f8b5342aa7b -size 26914 diff --git a/docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_22_1.png b/docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_22_1.png deleted file mode 100644 index 61e57d642da114..00000000000000 --- a/docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_22_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:50b9f932b844d99b59b51f2c6947dd048f96bf1553fe36de3975d3a3ad1715e4 -size 8966 diff --git a/docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_24_1.png b/docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_24_1.png deleted file mode 100644 index b113bcf292fe00..00000000000000 --- a/docs/notebooks/meter-reader-with-output_files/meter-reader-with-output_24_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ad7114f80f8925643c865222d0fe0e05d4f65ab54e0b0d354edebe3e5c1ade7c -size 170338 diff --git a/docs/notebooks/minicpm-v-multimodal-chatbot-with-output.rst b/docs/notebooks/minicpm-v-multimodal-chatbot-with-output.rst deleted file mode 100644 index 39c128e061155f..00000000000000 --- a/docs/notebooks/minicpm-v-multimodal-chatbot-with-output.rst +++ /dev/null @@ -1,390 +0,0 @@ -Visual-language assistant with MiniCPM-V2 and OpenVINO -====================================================== - -MiniCPM-V 2 is a strong multimodal large language model for efficient -end-side deployment. MiniCPM-V 2.6 is the latest and most capable model -in the MiniCPM-V series. The model is built on SigLip-400M and Qwen2-7B -with a total of 8B parameters. It exhibits a significant performance -improvement over previous versions, and introduces new features for -multi-image and video understanding. - -More details about model can be found in `model -card `__ and original -`repo `__. - -In this tutorial we consider how to convert and optimize MiniCPM-V2 -model for creating multimodal chatbot. 
Additionally, we demonstrate how -to apply stateful transformation on LLM part and model optimization -techniques like weights compression using -`NNCF `__ - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Convert model to OpenVINO Intermediate - Representation <#convert-model-to-openvino-intermediate-representation>`__ - - - `Compress Language Model Weights to 4 - bits <#compress-language-model-weights-to-4-bits>`__ - -- `Prepare model inference - pipeline <#prepare-model-inference-pipeline>`__ - - - `Select device <#select-device>`__ - -- `Run OpenVINO model inference <#run-openvino-model-inference>`__ -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "torch>=2.1" "torchvision" "timm>=0.9.2" "transformers>=4.45" "Pillow" "gradio>=4.40" "tqdm" "sentencepiece" "peft" "huggingface-hub>=0.24.0" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "nncf>=2.14.0" - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q -U "openvino>=2024.5" "openvino-tokenizers>=2024.5" "openvino-genai>=2024.5" - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("cmd_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py") - open("cmd_helper.py", "w").write(r.text) - - - if not Path("gradio_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks//minicpm-v-multimodal-chatbot//gradio_helper.py" - ) - open("gradio_helper.py", "w").write(r.text) - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("minicpm-v-multimodal-chatbot.ipynb") - -Convert model to OpenVINO Intermediate Representation ------------------------------------------------------ - - - -OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate -Representation format. For convenience, we will use OpenVINO integration -with HuggingFace Optimum. `Optimum -Intel `__ is the -interface between the Transformers and Diffusers libraries and the -different tools and libraries provided by Intel to accelerate end-to-end -pipelines on Intel architectures. - -Among other use cases, Optimum Intel provides a simple interface to -optimize your Transformers and Diffusers models, convert them to the -OpenVINO Intermediate Representation (IR) format and run inference using -OpenVINO Runtime. ``optimum-cli`` provides command line interface for -model conversion and optimization. - -General command format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where task is task to export the model for, if not specified, the task -will be auto-inferred based on the model. 
You can find a mapping between -tasks and model classes in the Optimum TaskManager -`documentation `__. -Additionally, you can specify weights compression using the -``--weight-format`` argument with one of the following options: ``fp32``, -``fp16``, ``int8`` and ``int4``. For ``int8`` and ``int4``, -`nncf `__ is used for -weight compression. More details about model export are provided in the `Optimum -Intel -documentation `__. - -Compress Language Model Weights to 4 bits -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To reduce memory consumption, weights compression can be -applied using `NNCF `__. - -.. raw:: html - -
- -.. raw:: html - - - -Click here for more details about weight compression - -.. raw:: html - - - -Weight compression aims to reduce the memory footprint of a model. It -can also lead to significant performance improvement for large -memory-bound models, such as Large Language Models (LLMs). LLMs and -other models, which require extensive memory to store the weights during -inference, can benefit from weight compression in the following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method primarily -designed to optimize LLMs. The main difference between weights -compression and full model quantization (post-training quantization) is -that activations remain floating-point in the case of weights -compression which leads to a better accuracy. Weight compression for -LLMs provides a solid inference performance improvement which is on par -with the performance of the full model quantization. In addition, weight -compression is data-free and does not require a calibration dataset, -making it easy to use. - -``nncf.compress_weights`` function can be used for performing weights -compression. The function accepts an OpenVINO model and other -compression parameters. Compared to INT8 compression, INT4 compression -improves performance even more, but introduces a minor drop in -prediction quality. - -More details about weights compression, can be found in `OpenVINO -documentation `__. - -.. raw:: html - -
- -.. code:: ipython3 - - from cmd_helper import optimum_cli - import nncf - import openvino as ov - import shutil - import gc - - - def compress_lm_weights(model_dir): - compression_configuration = {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 1.0, "all_layers": True} - ov_model_path = model_dir / "openvino_language_model.xml" - ov_int4_model_path = model_dir / "openvino_language_model_int4.xml" - ov_model = ov.Core().read_model(ov_model_path) - ov_compressed_model = nncf.compress_weights(ov_model, **compression_configuration) - ov.save_model(ov_compressed_model, ov_int4_model_path) - del ov_compressed_model - del ov_model - gc.collect() - ov_model_path.unlink() - ov_model_path.with_suffix(".bin").unlink() - shutil.move(ov_int4_model_path, ov_model_path) - shutil.move(ov_int4_model_path.with_suffix(".bin"), ov_model_path.with_suffix(".bin")) - - - model_id = "openbmb/MiniCPM-V-2_6" - model_dir = Path(model_id.split("/")[-1] + "-ov") - - if not model_dir.exists(): - optimum_cli(model_id, model_dir, additional_args={"trust-remote-code": "", "weight-format": "fp16", "task": "image-text-to-text"}) - compress_lm_weights(model_dir) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -Prepare model inference pipeline --------------------------------- - - - -.. image:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/2727402e-3697-442e-beca-26b149967c84 - -`OpenVINO™ GenAI `__ -is a library of the most popular Generative AI model pipelines, -optimized execution methods, and samples that run on top of highly -performant `OpenVINO -Runtime `__. - -This library is friendly to PC and laptop execution, and optimized for -resource consumption. It requires no external dependencies to run -generative models as it already includes all the core functionality -(e.g. tokenization via openvino-tokenizers). OpenVINO™ GenAI is a flavor -of OpenVINO™, aiming to simplify running inference of generative AI -models. It hides the complexity of the generation process and minimizes -the amount of code required. - -Inference Visual language models can be implemented using OpenVINO GenAI -``VLMPipeline`` class. Similarly to LLMPipeline, that we discussed in -this -`notebook `__. -It supports chat mode with preserving conversational history inside -pipeline, that allows us effectively implements chatbot that supports -conversation about provided images content. - -Select device -~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(default="AUTO", exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - import openvino_genai as ov_genai - - ov_model = ov_genai.VLMPipeline(model_dir, device=device.value) - -Run OpenVINO model inference ----------------------------- - - - -For preparing input data, ``VLMPipeline`` use tokenizer and image -processor inside, we just need to convert image to input OpenVINO tensor -and provide question as string. Additionally, we can provides options -for controlling generation process (e.g. number of maximum generated -tokens or using multinomial sampling for decoding instead of greedy -search approach) using ``GenerationConfig``. 
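For illustration, a minimal configuration sketch is shown below. The field names assume the ``openvino_genai.GenerationConfig`` API used elsewhere in this notebook, and the values are examples only, not tuned recommendations.

.. code:: ipython3

    import openvino_genai as ov_genai

    # Example only: switch from greedy search to multinomial sampling
    # and limit the length of the generated answer.
    sampling_config = ov_genai.GenerationConfig()
    sampling_config.max_new_tokens = 128  # stop after at most 128 generated tokens
    sampling_config.do_sample = True      # multinomial sampling instead of greedy search
    sampling_config.temperature = 0.7     # soften the next-token distribution
    sampling_config.top_p = 0.9           # nucleus sampling cutoff

    # The resulting config can be passed to generate() as generation_config=sampling_config.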
- -Generation process for long response may be time consuming, for -accessing partial result as soon as it is generated without waiting when -whole process finished, Streaming API can be used. Token streaming is -the mode in which the generative system returns the tokens one by one as -the model generates them. This enables showing progressive generations -to the user rather than waiting for the whole generation. Streaming is -an essential aspect of the end-user experience as it reduces latency, -one of the most critical aspects of a smooth experience. - -.. code:: ipython3 - - import requests - from PIL import Image - from io import BytesIO - import numpy as np - - image_path = "cat.png" - - - config = ov_genai.GenerationConfig() - config.max_new_tokens = 100 - - - def load_image(image_file): - if isinstance(image_file, str) and (image_file.startswith("http") or image_file.startswith("https")): - response = requests.get(image_file) - image = Image.open(BytesIO(response.content)).convert("RGB") - else: - image = Image.open(image_file).convert("RGB") - image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.byte) - return image, ov.Tensor(image_data) - - - def streamer(subword: str) -> bool: - """ - - Args: - subword: sub-word of the generated text. - - Returns: Return flag corresponds whether generation should be stopped. - - """ - print(subword, end="", flush=True) - - - if not Path(image_path).exists(): - url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11" - image = Image.open(requests.get(url, stream=True).raw) - image.save(image_path) - -.. code:: ipython3 - - image, image_tensor = load_image(image_path) - - question = "What is unusual on this image?" - - print(f"Question:\n{question}") - image - - -.. parsed-literal:: - - Question: - What is unusual on this image? - - - - -.. image:: minicpm-v-multimodal-chatbot-with-output_files/minicpm-v-multimodal-chatbot-with-output_12_1.png - - - -.. code:: ipython3 - - ov_model.start_chat() - output = ov_model.generate(question, image=image_tensor, generation_config=config, streamer=streamer) - - -.. parsed-literal:: - - The unusual aspect of this image is the cat's relaxed and vulnerable position. Typically, cats avoid exposing their bellies, which are sensitive and vulnerable areas, to potential threats. In this image, the cat is lying on its back in a cardboard box, exposing its belly and hindquarters, which is not a common sight. This behavior could indicate that the cat feels safe and comfortable in its environment, suggesting a strong bond with its owner and a sense of security in its home. - -Interactive demo ----------------- - - - -.. 
code:: ipython3 - - from gradio_helper import make_demo - - demo = make_demo(ov_model) - - try: - demo.launch(debug=True, height=600) - except Exception: - demo.launch(debug=True, share=True, height=600) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/minicpm-v-multimodal-chatbot-with-output_files/minicpm-v-multimodal-chatbot-with-output_12_1.jpg b/docs/notebooks/minicpm-v-multimodal-chatbot-with-output_files/minicpm-v-multimodal-chatbot-with-output_12_1.jpg deleted file mode 100644 index c6aeec77cd3cb2..00000000000000 --- a/docs/notebooks/minicpm-v-multimodal-chatbot-with-output_files/minicpm-v-multimodal-chatbot-with-output_12_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5fc0d22d75f23474fb4f8aec8c0bf0fdf5d9377f3379e82a3887003e6da47e7e -size 60425 diff --git a/docs/notebooks/minicpm-v-multimodal-chatbot-with-output_files/minicpm-v-multimodal-chatbot-with-output_12_1.png b/docs/notebooks/minicpm-v-multimodal-chatbot-with-output_files/minicpm-v-multimodal-chatbot-with-output_12_1.png deleted file mode 100644 index c6673a757ab5dc..00000000000000 --- a/docs/notebooks/minicpm-v-multimodal-chatbot-with-output_files/minicpm-v-multimodal-chatbot-with-output_12_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c715d8adee4bf7519690de20b57ef2edaa2f914c86a64d107f99a919dcdad218 -size 854224 diff --git a/docs/notebooks/mllama-3.2-with-output.rst b/docs/notebooks/mllama-3.2-with-output.rst deleted file mode 100644 index 0d60667c9e401f..00000000000000 --- a/docs/notebooks/mllama-3.2-with-output.rst +++ /dev/null @@ -1,832 +0,0 @@ -Visual-language assistant with Llama-3.2-11B-Vision and OpenVINO -================================================================ - -Llama-3.2-11B-Vision is the latest model from LLama3 model family those -capabilities extended to understand images content. The Llama 3.2-Vision -instruction-tuned models are optimized for visual recognition, image -reasoning, captioning, and answering general questions about an image. -Llama 3.2-Vision is built on top of Llama 3.1 text-only model, which is -an auto-regressive language model that uses an optimized transformer -architecture. The tuned versions use supervised fine-tuning (SFT) and -reinforcement learning with human feedback (RLHF) to align with human -preferences for helpfulness and safety. To support image recognition -tasks, the Llama 3.2-Vision model uses a separately trained vision -adapter that integrates with the pre-trained Llama 3.1 language model. -The adapter consists of a series of cross-attention layers that feed -image encoder representations into the core LLM. - -In this tutorial we consider how to convert, optimize and run this model -using OpenVINO. More details about model can be found in `model -card `__, -and original `repo `__. 
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Convert model <#convert-model>`__ -- `Select inference device <#select-inference-device>`__ -- `Optimize model using NNCF <#optimize-model-using-nncf>`__ - - - `Compress Language model weights in - 4bits <#compress-language-model-weights-in-4bits>`__ - - `Optimize Vision model <#optimize-vision-model>`__ - -- `Model Inference <#model-inference>`__ -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "torch>=2.1" "torchvision" "Pillow" "tqdm" "datasets>=2.14.6" "gradio>=4.36" "nncf>=2.14.0" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "transformers>=4.45" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -Uq --pre "openvino>=2024.5.0" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("ov_mllama_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/mllama3.2/ov_mllama_helper.py") - open("ov_mllama_helper.py", "w").write(r.text) - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/mllama3.2/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - if not Path("ov_mllama_compression.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/mllama3.2/ov_mllama_compression.py") - open("ov_mllama_compression.py", "w").write(r.text) - - if not Path("data_preprocessing.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/mllama3.2/data_preprocessing.py") - open("data_preprocessing", "w").write(r.text) - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("mllama-3.2.ipynb") - -Convert model -------------- - - - -OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate -Representation (IR). `OpenVINO model conversion -API `__ -should be used for these purposes. ``ov.convert_model`` function accepts -original PyTorch model instance and example input for tracing and -returns ``ov.Model`` representing this model in OpenVINO framework. -Converted model can be used for saving on disk using ``ov.save_model`` -function or directly loading on device using ``core.complie_model``. - -``ov_mllama_helper.py`` script contains helper function for model -conversion, please check its content if you interested in conversion -details. - -.. raw:: html - -
- -Click here for more detailed explanation of conversion steps -Llama-3.2.-Vision is autoregressive transformer generative model, it -means that each next model step depends from model output from previous -step. The generation approach is based on the assumption that the -probability distribution of a word sequence can be decomposed into the -product of conditional next word distributions. In other words, model -predicts the next token in the loop guided by previously generated -tokens until the stop-condition will be not reached (generated sequence -of maximum length or end of string token obtained). The way the next -token will be selected over predicted probabilities is driven by the -selected decoding methodology. You can find more information about the -most popular decoding methods in this -`blog `__. The entry point -for the generation process for models from the Hugging Face Transformers -library is the ``generate`` method. You can find more information about -its parameters and configuration in the -`documentation `__. -To preserve flexibility in the selection decoding methodology, we will -convert only model inference for one step. - -The inference flow has difference on first step and for the next. On the -first step, model accept preprocessed input instruction and image. Image -processed via ``Image Encoder`` to cross-attention state, after that -``language model``, LLM-based part of model, runs on cross-attention -states and tokenized input token ids to predict probability of next -generated tokens. On the next step, ``language_model`` accepts only next -token. Since the output side is auto-regressive, an output token hidden -state remains the same once computed for every further generation step. -Therefore, recomputing it every time you want to generate a new token -seems wasteful. With the cache, the model saves the hidden state once it -has been computed. The model only computes the one for the most recently -generated output token at each time step, re-using the saved ones for -hidden tokens. This reduces the generation complexity from -:math:`O(n^3)` to :math:`O(n^2)` for a transformer model. More details -about how it works can be found in this -`article `__. - -With increasing model size like in modern LLMs, we also can note an -increase in the number of attention blocks and size past key values -tensors respectively. The strategy for handling cache state as model -inputs and outputs in the inference cycle may become a bottleneck for -memory-bounded systems, especially with processing long input sequences, -for example in a chatbot scenario. OpenVINO suggests a transformation -that removes inputs and corresponding outputs with cache tensors from -the model keeping cache handling logic inside the model. Such models are -also called stateful. A stateful model is a model that implicitly -preserves data between two consecutive inference calls. The tensors -saved from one run are kept in an internal memory buffer called a -``state`` or a ``variable`` and may be passed to the next run, while -never being exposed as model output. Hiding the cache enables storing -and updating the cache values in a more device-friendly representation. -It helps to reduce memory consumption and additionally optimize model -performance. More details about stateful models and working with state -can be found in `OpenVINO -documentation `__. - -``image_encoder`` is represented in Llama-3.2-Vision by pretrained VIT -model. 
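To make the stateful behavior described above more concrete, the sketch below shows how a compiled stateful model exposes its cache through the OpenVINO ``query_state``/``reset_state`` API instead of explicit past-key-value inputs and outputs. The model path here is a hypothetical placeholder, not a file produced by this notebook.

.. code:: ipython3

    import openvino as ov

    core = ov.Core()

    # Hypothetical example: "stateful_model.xml" stands in for any stateful IR,
    # such as the converted language model produced later in this notebook.
    compiled_model = core.compile_model("stateful_model.xml", "CPU")
    infer_request = compiled_model.create_infer_request()

    # Each VariableState is one internal cache tensor that persists between
    # infer() calls rather than being exposed as a model input or output.
    for state in infer_request.query_state():
        print(state.name)

    # Drop the accumulated cache before starting an unrelated generation.
    infer_request.reset_state()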
- -To sum up, the model consists of two parts: - -- **Image Encoder** for encoding input images into the LLM cross-attention - state space. -- **Language Model** for generating the answer based on the cross-attention - states provided by the Image Encoder and the input tokens. - -Let's convert each model part. - -.. raw:: html - -
- -.. - - **Note**: run model with notebook, you will need to accept license - agreement. You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, - carefully read terms of usage and click accept button. You will need - to use an access token for the code below to run. For more - information on access tokens, refer to `this section of the - documentation `__. - You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: ipython3 - - # uncomment these lines to login to huggingfacehub to get access to pretrained model - - # from huggingface_hub import notebook_login, whoami - - # try: - # whoami() - # print('Authorization token already provided') - # except OSError: - # notebook_login() - -.. code:: ipython3 - - from pathlib import Path - from ov_mllama_helper import convert_mllama - - model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct" - model_dir = Path(model_id.split("/")[-1]) / "OV" - - # uncomment the line to see model conversion code - # convert_mllama?? - - -.. parsed-literal:: - - 2025-01-07 08:39:57.815213: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2025-01-07 08:39:57.827771: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - WARNING: All log messages before absl::InitializeLog() is called are written to STDERR - E0000 00:00:1736224797.842114 2088673 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - E0000 00:00:1736224797.846261 2088673 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2025-01-07 08:39:57.861492: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - - -.. code:: ipython3 - - convert_mllama(model_id, model_dir) - - -.. parsed-literal:: - - ⌛ Load original model - - - -.. parsed-literal:: - - Downloading shards: 0%| | 0/5 [00:00 0 else None - - -.. parsed-literal:: - - ✅ Vision model successfully converted - ⌛ Convert language model... - - -.. parsed-literal:: - - /home/ea/work/py311/lib/python3.11/site-packages/transformers/cache_utils.py:458: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. - or len(self.key_cache[layer_idx]) == 0 # the layer has no cache - /home/ea/work/py311/lib/python3.11/site-packages/transformers/models/mllama/modeling_mllama.py:1819: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
- if sequence_length != 1: - /home/ea/work/py311/lib/python3.11/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. - elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors - /home/ea/work/py311/lib/python3.11/site-packages/transformers/models/mllama/modeling_mllama.py:1653: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if is_cross_attention_layer and cross_attention_states is None and is_cross_attention_cache_empty: - /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/mllama-3.2/ov_mllama_helper.py:401: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - elif past_key_value.get_seq_length(self.layer_idx) != 0: - - -.. parsed-literal:: - - ✅ Language model successfully converted - ✅ Model sucessfully converted and can be found in Llama-3.2-11B-Vision-Instruct/OV - - -Select inference device ------------------------ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget("CPU", exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -Optimize model using NNCF -------------------------- - - - -Compress Language model weights in 4bits -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -For reducing memory consumption, weights compression optimization can be -applied using `NNCF `__. - -.. raw:: html - -
- -Click here for more details about weight compression Weight compression -aims to reduce the memory footprint of a model. It can also lead to -significant performance improvement for large memory-bound models, such -as Large Language Models (LLMs). LLMs and other models, which require -extensive memory to store the weights during inference, can benefit from -weight compression in the following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method primarily -designed to optimize LLMs. The main difference between weights -compression and full model quantization (post-training quantization) is -that activations remain floating-point in the case of weights -compression which leads to a better accuracy. Weight compression for -LLMs provides a solid inference performance improvement which is on par -with the performance of the full model quantization. In addition, weight -compression is data-free and does not require a calibration dataset, -making it easy to use. - -``nncf.compress_weights`` function can be used for performing weights -compression. The function accepts an OpenVINO model and other -compression parameters. Compared to INT8 compression, INT4 compression -improves performance even more, but introduces a minor drop in -prediction quality. - -More details about weights compression, can be found in `OpenVINO -documentation `__. - -.. raw:: html - -
- -In this tutorial we consider usage Data-Aware weights compression. Such -approaches may require more time and memory as they involves calibration -dataset, while promising better int4 model accuracy. > **Note:** AWQ -weight quantization requires at least 64GB RAM, if you run notebook in -memory-constrained environment, you can switch to data-free weight -compression using widget bellow - -.. code:: ipython3 - - from ov_mllama_compression import compress - - # uncomment the line to see compression code - # compress?? - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -.. code:: ipython3 - - from ov_mllama_compression import compression_widgets_helper - - compression_scenario, compress_args = compression_widgets_helper() - - compression_scenario - - - - -.. parsed-literal:: - - VBox(children=(RadioButtons(index=1, options=('data-free', 'data-aware'), value='data-aware'), Accordion(child… - - - -.. code:: ipython3 - - compression_kwargs = {key: value.value for key, value in compress_args.items()} - - language_model_path = compress(model_dir, **compression_kwargs) - - -.. parsed-literal:: - - ⌛ Dataset preparation started - Fetching 64 samples for the initialization... - - - -.. parsed-literal:: - - 0%| | 0/64 [00:00`__. - -Basically model quantization process consists of 3 steps: 1. Prepare -quantization dataset 2. Perform model quantization using -``nncf.quantize`` 3. Save optimized model on disk using -``ov.save_model`` - - **Note:** Model quantization may requires additional time and memory - for optimization and be non-applicable for some devices. You can skip - quantization step or replace it with weight compression using widget - bellow if you does not have enough resources. - -.. code:: ipython3 - - from ov_mllama_compression import vision_encoder_selection_widget - - vision_encoder_options = vision_encoder_selection_widget() - - vision_encoder_options - - - - -.. parsed-literal:: - - Dropdown(description='Vision Encoder', index=1, options=('FP16', 'INT8 quantization', 'INT8 weights compressio… - - - -.. 
code:: ipython3 - - from transformers import AutoProcessor - import nncf - import openvino as ov - import gc - - from data_preprocessing import prepare_dataset_vision - - processor = AutoProcessor.from_pretrained(model_dir) - core = ov.Core() - - fp_vision_encoder_path = model_dir / "openvino_vision_encoder.xml" - int8_vision_encoder_path = model_dir / fp_vision_encoder_path.name.replace(".xml", "_int8.xml") - int8_wc_vision_encoder_path = model_dir / fp_vision_encoder_path.name.replace(".xml", "_int8_wc.xml") - - - if vision_encoder_options.value == "INT8 quantization": - if not int8_vision_encoder_path.exists(): - calibration_data = prepare_dataset_vision(processor, 100) - ov_model = core.read_model(fp_vision_encoder_path) - calibration_dataset = nncf.Dataset(calibration_data) - quantized_model = nncf.quantize( - model=ov_model, - calibration_dataset=calibration_dataset, - model_type=nncf.ModelType.TRANSFORMER, - advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alphas=nncf.AdvancedSmoothQuantParameters(matmul=0.6)), - subset_size=100, - ) - ov.save_model(quantized_model, int8_vision_encoder_path) - del quantized_model - del ov_model - del calibration_dataset - del calibration_data - gc.collect() - - vision_encoder_path = int8_vision_encoder_path - elif vision_encoder_options.value == "INT8 weights compression": - if not int8_wc_vision_encoder_path.exists(): - ov_model = core.read_model(fp_vision_encoder_path) - compressed_model = nncf.compress_weights(ov_model) - ov.save_model(compressed_model, int8_wc_vision_encoder_path) - vision_encoder_path = int8_wc_vision_encoder_path - else: - vision_encoder_path = fp_vision_encoder_path - - -.. parsed-literal:: - - /home/ea/work/py311/lib/python3.11/site-packages/nncf/quantization/algorithms/post_training/pipeline.py:87: FutureWarning: `AdvancedQuantizationParameters(smooth_quant_alpha=..)` is deprecated.Please, use `AdvancedQuantizationParameters(smooth_quant_alphas)` option with AdvancedSmoothQuantParameters(convolution=.., matmul=..) as value instead. - warning_deprecated( - - - -.. parsed-literal:: - - Output() - - - - - - - - - -.. parsed-literal:: - - WARNING:nncf:Dataset contains only 100 samples, smaller than the requested subset size 300. - - - -.. parsed-literal:: - - Output() - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - -.. parsed-literal:: - - WARNING:nncf:Dataset contains only 100 samples, smaller than the requested subset size 300. - - -.. parsed-literal:: - - /home/ea/work/py311/lib/python3.11/site-packages/numpy/core/_methods.py:118: RuntimeWarning: overflow encountered in reduce - ret = umr_sum(arr, axis, dtype, out, keepdims, where=where) - - - -.. parsed-literal:: - - Output() - - - - - - - - - -Model Inference ---------------- - - - -Now, we are ready to test model inference. -``OVOVMLlamaForConditionalGeneration`` defined in -``ov_mllama_helper.py`` has similar generation interface with original -model and additionally enables runtime optimizations for efficient model -inference with OpenVINO: - **Slicing LM head** - usually LLM models -provides probability for all input tokens, while for selection next -token, we are interested only for the last one. Reducing Language Model -head size to return only last token probability may provide better -performance and reduce memory consumption for the first inference, where -usually whole input prompt processed. You can find more details about -this optimization in `OpenVINO -blog `__ - -.. raw:: html - -

- -.. raw:: html - -

- -- **Using Remote tensors for GPU** - Coping data on device and back - into host memory can become bottleneck for efficient execution - multi-model pipeline on GPU. `Remote Tensor - API `__ - provides functionality for low-level GPU memory management, we can - use this feature for sharing cross-attention keys and values between - Image Encoder and Language Model. - -.. code:: ipython3 - - from ov_mllama_helper import OVMLlamaForConditionalGeneration - - # Uncomment this line to see model inference code - # OVMLlamaForConditionalGeneration?? - - ov_model = OVMLlamaForConditionalGeneration( - model_dir, device=device.value, language_model_name=language_model_path.name, image_encoder_name=vision_encoder_path.name - ) - processor = AutoProcessor.from_pretrained(model_dir) - - -.. parsed-literal:: - - applied slice for lm head - - -.. code:: ipython3 - - from PIL import Image - from transformers import TextStreamer - import numpy as np - - question = "What is unusual on this image?" - - messages = [ - {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": question}]}, - ] - text = processor.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) - - input_image_path = Path("cat.png") - if not input_image_path.exists(): - url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11" - raw_image = Image.open(requests.get(url, stream=True).raw) - raw_image.save(input_image_path) - else: - raw_image = Image.open(input_image_path) - - inputs = processor(text=text, images=[raw_image], return_tensors="pt") - streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True) - print(f"Question: {question}") - display(raw_image) - output = ov_model.generate(**inputs, do_sample=False, max_new_tokens=100, temperature=None, top_p=None, streamer=streamer) - print(f"Visual encoder time {ov_model.vision_encoder_infer_time[0] * 1000 :.2f} ms") - print(f"First token latency {ov_model.llm_infer_time[0] * 1000 :.2f}ms, Second token latency {np.mean(np.array(ov_model.llm_infer_time[1:])) * 1000:.2f}ms") - - -.. parsed-literal:: - - Question: What is unusual on this image? - - - -.. image:: mllama-3.2-with-output_files/mllama-3.2-with-output_19_1.png - - -.. parsed-literal:: - - The cat is lying in a box. The cat is lying in a box, which is unusual because cats are known for their love of boxes. The cat's unusual behavior of lying in a box is likely due to its natural instinct to seek out small, enclosed spaces for rest and relaxation. - Visual encoder time 19083.66 ms - First token latency 2937.73ms, Second token latency 175.03ms - - -Interactive demo ----------------- - - - -.. code:: ipython3 - - from gradio_helper import make_demo - - processor.chat_template = processor.tokenizer.chat_template - demo = make_demo(ov_model, processor) - - try: - demo.launch(debug=False) - except Exception: - demo.launch(debug=False, share=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - - -.. parsed-literal:: - - /home/ea/work/py311/lib/python3.11/site-packages/gradio/components/chatbot.py:228: UserWarning: The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys. - warnings.warn( - - -.. 
parsed-literal:: - - * Running on local URL: http://127.0.0.1:7860 - Rerunning server... use `close()` to stop if you need to change `launch()` parameters. - ---- - * Running on public URL: https://5276392df9bae2f87b.gradio.live - - This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces) - - - - - - - diff --git a/docs/notebooks/mllama-3.2-with-output_files/mllama-3.2-with-output_19_1.jpg b/docs/notebooks/mllama-3.2-with-output_files/mllama-3.2-with-output_19_1.jpg deleted file mode 100644 index c6aeec77cd3cb2..00000000000000 --- a/docs/notebooks/mllama-3.2-with-output_files/mllama-3.2-with-output_19_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5fc0d22d75f23474fb4f8aec8c0bf0fdf5d9377f3379e82a3887003e6da47e7e -size 60425 diff --git a/docs/notebooks/mllama-3.2-with-output_files/mllama-3.2-with-output_19_1.png b/docs/notebooks/mllama-3.2-with-output_files/mllama-3.2-with-output_19_1.png deleted file mode 100644 index c6673a757ab5dc..00000000000000 --- a/docs/notebooks/mllama-3.2-with-output_files/mllama-3.2-with-output_19_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c715d8adee4bf7519690de20b57ef2edaa2f914c86a64d107f99a919dcdad218 -size 854224 diff --git a/docs/notebooks/mms-massively-multilingual-speech-with-output.rst b/docs/notebooks/mms-massively-multilingual-speech-with-output.rst deleted file mode 100644 index 7176885ad5f7a1..00000000000000 --- a/docs/notebooks/mms-massively-multilingual-speech-with-output.rst +++ /dev/null @@ -1,961 +0,0 @@ -MMS: Scaling Speech Technology to 1000+ languages with OpenVINO™ -================================================================ - -The Massively Multilingual Speech (MMS) project expands speech -technology from about 100 languages to over 1,000 by building a single -multilingual speech recognition model supporting over 1,100 languages -(more than 10 times as many as before), language identification models -able to identify over 4,000 languages (40 times more than before), -pretrained models supporting over 1,400 languages, and text-to-speech -models for over 1,100 languages. - -The MMS model was proposed in `Scaling Speech Technology to 1,000+ -Languages `__. The models and code are -originally released -`here `__. - -There are different open sourced models in the MMS project: Automatic -Speech Recognition (ASR), Language Identification (LID) and Speech -Synthesis (TTS). A simple diagram of this is below. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/76171391/0e7fadd6-29a8-4fac-bd9c-41d66adcb045 - :alt: LID and ASR flow - - LID and ASR flow - -In this notebook we are considering ASR and LID. We will use LID model -to identify language, and then language-specific ASR model to recognize -it. Additional models quantization step is employed to improve models -inference speed. In the end of the notebook there’s a Gradio-based -interactive demo. 
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Prepare an example audio <#prepare-an-example-audio>`__ -- `Language Identification (LID) <#language-identification-lid>`__ - - - `Download pretrained model and - processor <#download-pretrained-model-and-processor>`__ - - `Use the original model to run an - inference <#use-the-original-model-to-run-an-inference>`__ - - `Convert to OpenVINO IR model and run an - inference <#convert-to-openvino-ir-model-and-run-an-inference>`__ - -- `Automatic Speech Recognition - (ASR) <#automatic-speech-recognition-asr>`__ - - - `Download pretrained model and - processor <#download-pretrained-model-and-processor>`__ - - `Use the original model for - inference <#use-the-original-model-for-inference>`__ - - `Convert to OpenVINO IR model and run - inference <#convert-to-openvino-ir-model-and-run-inference>`__ - -- `Quantization <#quantization>`__ - - - `Preparing calibration dataset <#preparing-calibration-dataset>`__ - - `Language identification model - quantization <#language-identification-model-quantization>`__ - - `Speech recognition model - quantization <#speech-recognition-model-quantization>`__ - - `Compare model size, performance and - accuracy <#compare-model-size-performance-and-accuracy>`__ - -- `Interactive demo with Gradio <#interactive-demo-with-gradio>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q --upgrade pip - %pip install -q "transformers>=4.33.1" "torch>=2.1" "openvino>=2023.1.0" "numpy>=1.21.0" "nncf>=2.9.0" - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu torch "datasets>=2.14.6" accelerate soundfile librosa "gradio>=4.19" jiwer - -.. code:: ipython3 - - from pathlib import Path - - import torch - - import openvino as ov - - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("mms-massively-multilingual-speech.ipynb") - -Prepare an example audio ------------------------- - - - -Read an audio file and process the audio data. Make sure that the audio -data is sampled to 16000 kHz. For this example we will use `a streamable -version of the Multilingual LibriSpeech (MLS) -dataset `__. -It supports contains example on 7 languages: -``'german', 'dutch', 'french', 'spanish', 'italian', 'portuguese', 'polish'``. -Choose one of them. - -.. code:: ipython3 - - import ipywidgets as widgets - - - SAMPLE_LANG = widgets.Dropdown( - options=["german", "dutch", "french", "spanish", "italian", "portuguese", "polish"], - value="german", - description="Dataset language:", - disabled=False, - ) - - SAMPLE_LANG - - - - -.. parsed-literal:: - - Dropdown(description='Dataset language:', options=('german', 'dutch', 'french', 'spanish', 'italian', 'portugu… - - - -Specify ``streaming=True`` to not download the entire dataset. - -.. 
code:: ipython3 - - from datasets import load_dataset - - - mls_dataset = load_dataset("facebook/multilingual_librispeech", SAMPLE_LANG.value, split="test", streaming=True, trust_remote_code=True) - mls_dataset = iter(mls_dataset) # make it iterable - - example = next(mls_dataset) # get one example - -Example has a dictionary structure. It contains an audio data and a text -transcription. - -.. code:: ipython3 - - print(example) # look at structure - - -.. parsed-literal:: - - {'file': None, 'audio': {'path': '1054_1599_000000.flac', 'array': array([-0.00131226, -0.00152588, -0.00134277, ..., 0.00411987, - 0.00308228, -0.00015259]), 'sampling_rate': 16000}, 'text': 'mein sechster sohn scheint wenigstens auf den ersten blick der tiefsinnigste von allen ein kopfhänger und doch ein schwätzer deshalb kommt man ihm nicht leicht bei ist er am unterliegen so verfällt er in unbesiegbare traurigkeit', 'speaker_id': 1054, 'chapter_id': 1599, 'id': '1054_1599_000000'} - - -.. code:: ipython3 - - import IPython.display as ipd - - print(example["transcript"]) - ipd.Audio(example["audio"]["array"], rate=16_000) - - -.. parsed-literal:: - - mein sechster sohn scheint wenigstens auf den ersten blick der tiefsinnigste von allen ein kopfhänger und doch ein schwätzer deshalb kommt man ihm nicht leicht bei ist er am unterliegen so verfällt er in unbesiegbare traurigkeit - - - - -.. raw:: html - - - - - - - -Language Identification (LID) ------------------------------ - - - -Download pretrained model and processor -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Different LID models are available based on the number of languages they -can recognize - 126, 256, 512, 1024, 2048, 4017. We will use 126. - -.. code:: ipython3 - - from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor - - model_id = "facebook/mms-lid-126" - - lid_processor = AutoFeatureExtractor.from_pretrained(model_id) - lid_model = Wav2Vec2ForSequenceClassification.from_pretrained(model_id) - -Use the original model to run an inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - inputs = lid_processor(example["audio"]["array"], sampling_rate=16_000, return_tensors="pt") - - with torch.no_grad(): - outputs = lid_model(**inputs).logits - - lang_id = torch.argmax(outputs, dim=-1)[0].item() - detected_lang = lid_model.config.id2label[lang_id] - print(detected_lang) - - -.. parsed-literal:: - - deu - - -Convert to OpenVINO IR model and run an inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - core = ov.Core() - - device = device_widget("CPU", exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Convert model to OpenVINO format and compile it - -.. 
code:: ipython3 - - MAX_SEQ_LENGTH = 30480 - - lid_model_xml_path = Path("models/ov_lid_model.xml") - - - def get_lid_model(model_path, compiled=True): - input_values = torch.zeros([1, MAX_SEQ_LENGTH], dtype=torch.float) - - if not model_path.exists() and model_path == lid_model_xml_path: - lid_model_xml_path.parent.mkdir(parents=True, exist_ok=True) - converted_model = ov.convert_model(lid_model, example_input={"input_values": input_values}) - ov.save_model(converted_model, lid_model_xml_path) - if not compiled: - return converted_model - if compiled: - return core.compile_model(model_path, device_name=device.value) - return core.read_model(model_path) - - - compiled_lid_model = get_lid_model(lid_model_xml_path) - - -.. parsed-literal:: - - /home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:595: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - /home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:634: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): - - -Now it is possible to run an inference. - -.. code:: ipython3 - - def detect_language(compiled_model, audio_data): - inputs = lid_processor(audio_data, sampling_rate=16_000, return_tensors="pt") - - outputs = compiled_model(inputs["input_values"])[0] - - lang_id = torch.argmax(torch.from_numpy(outputs), dim=-1)[0].item() - detected_lang = lid_model.config.id2label[lang_id] - - return detected_lang - -.. code:: ipython3 - - detect_language(compiled_lid_model, example["audio"]["array"]) - - - - -.. parsed-literal:: - - 'deu' - - - -Let’s check another language. - -.. code:: ipython3 - - SAMPLE_LANG = widgets.Dropdown( - options=["german", "dutch", "french", "spanish", "italian", "portuguese", "polish"], - value="french", - description="Dataset language:", - disabled=False, - ) - - SAMPLE_LANG - - - - -.. parsed-literal:: - - Dropdown(description='Dataset language:', index=2, options=('german', 'dutch', 'french', 'spanish', 'italian',… - - - -.. code:: ipython3 - - mls_dataset = load_dataset("facebook/multilingual_librispeech", SAMPLE_LANG.value, split="test", streaming=True, trust_remote_code=True) - mls_dataset = iter(mls_dataset) - - example = next(mls_dataset) - print(example["transcript"]) - ipd.Audio(example["audio"]["array"], rate=16_000) - - -.. parsed-literal:: - - grisé par ce parfum il fit des vers en l'honneur de l'humble fleur des bois et il les récita tout haut à ses pieds une violette l'entendit elle crut qu'il ne parlait que pour elle - - - - -.. raw:: html - - - - - - - -.. code:: ipython3 - - language_id = detect_language(compiled_lid_model, example["audio"]["array"]) - print(language_id) - - -.. parsed-literal:: - - fra - - -Automatic Speech Recognition (ASR) ----------------------------------- - - - -Download pretrained model and processor -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Download pretrained model and processor. 
By default, MMS loads adapter -weights for English. If you want to load adapter weights of another -language make sure to specify ``target_lang=`` -as well as ``ignore_mismatched_sizes=True``. The -``ignore_mismatched_sizes=True`` keyword has to be passed to allow the -language model head to be resized according to the vocabulary of the -specified language. Similarly, the processor should be loaded with the -same target language. It is also possible to change the supported -language later. - -.. code:: ipython3 - - from transformers import Wav2Vec2ForCTC, AutoProcessor - - model_id = "facebook/mms-1b-all" - - asr_processor = AutoProcessor.from_pretrained(model_id) - asr_model = Wav2Vec2ForCTC.from_pretrained(model_id) - -You can look at all supported languages: - -.. code:: ipython3 - - asr_processor.tokenizer.vocab.keys() - - - - -.. parsed-literal:: - - dict_keys(['abi', 'abk', 'abp', 'aca', 'acd', 'ace', 'acf', 'ach', 'acn', 'acr', 'acu', 'ade', 'adh', 'adj', 'adx', 'aeu', 'afr', 'agd', 'agg', 'agn', 'agr', 'agu', 'agx', 'aha', 'ahk', 'aia', 'aka', 'akb', 'ake', 'akp', 'alj', 'alp', 'alt', 'alz', 'ame', 'amf', 'amh', 'ami', 'amk', 'ann', 'any', 'aoz', 'apb', 'apr', 'ara', 'arl', 'asa', 'asg', 'asm', 'ast', 'ata', 'atb', 'atg', 'ati', 'atq', 'ava', 'avn', 'avu', 'awa', 'awb', 'ayo', 'ayr', 'ayz', 'azb', 'azg', 'azj-script_cyrillic', 'azj-script_latin', 'azz', 'bak', 'bam', 'ban', 'bao', 'bas', 'bav', 'bba', 'bbb', 'bbc', 'bbo', 'bcc-script_arabic', 'bcc-script_latin', 'bcl', 'bcw', 'bdg', 'bdh', 'bdq', 'bdu', 'bdv', 'beh', 'bel', 'bem', 'ben', 'bep', 'bex', 'bfa', 'bfo', 'bfy', 'bfz', 'bgc', 'bgq', 'bgr', 'bgt', 'bgw', 'bha', 'bht', 'bhz', 'bib', 'bim', 'bis', 'biv', 'bjr', 'bjv', 'bjw', 'bjz', 'bkd', 'bkv', 'blh', 'blt', 'blx', 'blz', 'bmq', 'bmr', 'bmu', 'bmv', 'bng', 'bno', 'bnp', 'boa', 'bod', 'boj', 'bom', 'bor', 'bos', 'bov', 'box', 'bpr', 'bps', 'bqc', 'bqi', 'bqj', 'bqp', 'bre', 'bru', 'bsc', 'bsq', 'bss', 'btd', 'bts', 'btt', 'btx', 'bud', 'bul', 'bus', 'bvc', 'bvz', 'bwq', 'bwu', 'byr', 'bzh', 'bzi', 'bzj', 'caa', 'cab', 'cac-dialect_sanmateoixtatan', 'cac-dialect_sansebastiancoatan', 'cak-dialect_central', 'cak-dialect_santamariadejesus', 'cak-dialect_santodomingoxenacoj', 'cak-dialect_southcentral', 'cak-dialect_western', 'cak-dialect_yepocapa', 'cap', 'car', 'cas', 'cat', 'cax', 'cbc', 'cbi', 'cbr', 'cbs', 'cbt', 'cbu', 'cbv', 'cce', 'cco', 'cdj', 'ceb', 'ceg', 'cek', 'ces', 'cfm', 'cgc', 'che', 'chf', 'chv', 'chz', 'cjo', 'cjp', 'cjs', 'ckb', 'cko', 'ckt', 'cla', 'cle', 'cly', 'cme', 'cmn-script_simplified', 'cmo-script_khmer', 'cmo-script_latin', 'cmr', 'cnh', 'cni', 'cnl', 'cnt', 'coe', 'cof', 'cok', 'con', 'cot', 'cou', 'cpa', 'cpb', 'cpu', 'crh', 'crk-script_latin', 'crk-script_syllabics', 'crn', 'crq', 'crs', 'crt', 'csk', 'cso', 'ctd', 'ctg', 'cto', 'ctu', 'cuc', 'cui', 'cuk', 'cul', 'cwa', 'cwe', 'cwt', 'cya', 'cym', 'daa', 'dah', 'dan', 'dar', 'dbj', 'dbq', 'ddn', 'ded', 'des', 'deu', 'dga', 'dgi', 'dgk', 'dgo', 'dgr', 'dhi', 'did', 'dig', 'dik', 'dip', 'div', 'djk', 'dnj-dialect_blowowest', 'dnj-dialect_gweetaawueast', 'dnt', 'dnw', 'dop', 'dos', 'dsh', 'dso', 'dtp', 'dts', 'dug', 'dwr', 'dyi', 'dyo', 'dyu', 'dzo', 'eip', 'eka', 'ell', 'emp', 'enb', 'eng', 'enx', 'epo', 'ese', 'ess', 'est', 'eus', 'evn', 'ewe', 'eza', 'fal', 'fao', 'far', 'fas', 'fij', 'fin', 'flr', 'fmu', 'fon', 'fra', 'frd', 'fry', 'ful', 'gag-script_cyrillic', 'gag-script_latin', 'gai', 'gam', 'gau', 'gbi', 'gbk', 'gbm', 'gbo', 'gde', 'geb', 'gej', 'gil', 'gjn', 'gkn', 'gld', 'gle', 'glg', 
'glk', 'gmv', 'gna', 'gnd', 'gng', 'gof-script_latin', 'gog', 'gor', 'gqr', 'grc', 'gri', 'grn', 'grt', 'gso', 'gub', 'guc', 'gud', 'guh', 'guj', 'guk', 'gum', 'guo', 'guq', 'guu', 'gux', 'gvc', 'gvl', 'gwi', 'gwr', 'gym', 'gyr', 'had', 'hag', 'hak', 'hap', 'hat', 'hau', 'hay', 'heb', 'heh', 'hif', 'hig', 'hil', 'hin', 'hlb', 'hlt', 'hne', 'hnn', 'hns', 'hoc', 'hoy', 'hrv', 'hsb', 'hto', 'hub', 'hui', 'hun', 'hus-dialect_centralveracruz', 'hus-dialect_westernpotosino', 'huu', 'huv', 'hvn', 'hwc', 'hye', 'hyw', 'iba', 'ibo', 'icr', 'idd', 'ifa', 'ifb', 'ife', 'ifk', 'ifu', 'ify', 'ign', 'ikk', 'ilb', 'ilo', 'imo', 'ina', 'inb', 'ind', 'iou', 'ipi', 'iqw', 'iri', 'irk', 'isl', 'ita', 'itl', 'itv', 'ixl-dialect_sangasparchajul', 'ixl-dialect_sanjuancotzal', 'ixl-dialect_santamarianebaj', 'izr', 'izz', 'jac', 'jam', 'jav', 'jbu', 'jen', 'jic', 'jiv', 'jmc', 'jmd', 'jpn', 'jun', 'juy', 'jvn', 'kaa', 'kab', 'kac', 'kak', 'kam', 'kan', 'kao', 'kaq', 'kat', 'kay', 'kaz', 'kbo', 'kbp', 'kbq', 'kbr', 'kby', 'kca', 'kcg', 'kdc', 'kde', 'kdh', 'kdi', 'kdj', 'kdl', 'kdn', 'kdt', 'kea', 'kek', 'ken', 'keo', 'ker', 'key', 'kez', 'kfb', 'kff-script_telugu', 'kfw', 'kfx', 'khg', 'khm', 'khq', 'kia', 'kij', 'kik', 'kin', 'kir', 'kjb', 'kje', 'kjg', 'kjh', 'kki', 'kkj', 'kle', 'klu', 'klv', 'klw', 'kma', 'kmd', 'kml', 'kmr-script_arabic', 'kmr-script_cyrillic', 'kmr-script_latin', 'kmu', 'knb', 'kne', 'knf', 'knj', 'knk', 'kno', 'kog', 'kor', 'kpq', 'kps', 'kpv', 'kpy', 'kpz', 'kqe', 'kqp', 'kqr', 'kqy', 'krc', 'kri', 'krj', 'krl', 'krr', 'krs', 'kru', 'ksb', 'ksr', 'kss', 'ktb', 'ktj', 'kub', 'kue', 'kum', 'kus', 'kvn', 'kvw', 'kwd', 'kwf', 'kwi', 'kxc', 'kxf', 'kxm', 'kxv', 'kyb', 'kyc', 'kyf', 'kyg', 'kyo', 'kyq', 'kyu', 'kyz', 'kzf', 'lac', 'laj', 'lam', 'lao', 'las', 'lat', 'lav', 'law', 'lbj', 'lbw', 'lcp', 'lee', 'lef', 'lem', 'lew', 'lex', 'lgg', 'lgl', 'lhu', 'lia', 'lid', 'lif', 'lin', 'lip', 'lis', 'lit', 'lje', 'ljp', 'llg', 'lln', 'lme', 'lnd', 'lns', 'lob', 'lok', 'lom', 'lon', 'loq', 'lsi', 'lsm', 'ltz', 'luc', 'lug', 'luo', 'lwo', 'lww', 'lzz', 'maa-dialect_sanantonio', 'maa-dialect_sanjeronimo', 'mad', 'mag', 'mah', 'mai', 'maj', 'mak', 'mal', 'mam-dialect_central', 'mam-dialect_northern', 'mam-dialect_southern', 'mam-dialect_western', 'maq', 'mar', 'maw', 'maz', 'mbb', 'mbc', 'mbh', 'mbj', 'mbt', 'mbu', 'mbz', 'mca', 'mcb', 'mcd', 'mco', 'mcp', 'mcq', 'mcu', 'mda', 'mdf', 'mdv', 'mdy', 'med', 'mee', 'mej', 'men', 'meq', 'met', 'mev', 'mfe', 'mfh', 'mfi', 'mfk', 'mfq', 'mfy', 'mfz', 'mgd', 'mge', 'mgh', 'mgo', 'mhi', 'mhr', 'mhu', 'mhx', 'mhy', 'mib', 'mie', 'mif', 'mih', 'mil', 'mim', 'min', 'mio', 'mip', 'miq', 'mit', 'miy', 'miz', 'mjl', 'mjv', 'mkd', 'mkl', 'mkn', 'mlg', 'mlt', 'mmg', 'mnb', 'mnf', 'mnk', 'mnw', 'mnx', 'moa', 'mog', 'mon', 'mop', 'mor', 'mos', 'mox', 'moz', 'mpg', 'mpm', 'mpp', 'mpx', 'mqb', 'mqf', 'mqj', 'mqn', 'mri', 'mrw', 'msy', 'mtd', 'mtj', 'mto', 'muh', 'mup', 'mur', 'muv', 'muy', 'mvp', 'mwq', 'mwv', 'mxb', 'mxq', 'mxt', 'mxv', 'mya', 'myb', 'myk', 'myl', 'myv', 'myx', 'myy', 'mza', 'mzi', 'mzj', 'mzk', 'mzm', 'mzw', 'nab', 'nag', 'nan', 'nas', 'naw', 'nca', 'nch', 'ncj', 'ncl', 'ncu', 'ndj', 'ndp', 'ndv', 'ndy', 'ndz', 'neb', 'new', 'nfa', 'nfr', 'nga', 'ngl', 'ngp', 'ngu', 'nhe', 'nhi', 'nhu', 'nhw', 'nhx', 'nhy', 'nia', 'nij', 'nim', 'nin', 'nko', 'nlc', 'nld', 'nlg', 'nlk', 'nmz', 'nnb', 'nno', 'nnq', 'nnw', 'noa', 'nob', 'nod', 'nog', 'not', 'npi', 'npl', 'npy', 'nso', 'nst', 'nsu', 'ntm', 'ntr', 'nuj', 'nus', 'nuz', 'nwb', 'nxq', 'nya', 'nyf', 'nyn', 'nyo', 
'nyy', 'nzi', 'obo', 'oci', 'ojb-script_latin', 'ojb-script_syllabics', 'oku', 'old', 'omw', 'onb', 'ood', 'orm', 'ory', 'oss', 'ote', 'otq', 'ozm', 'pab', 'pad', 'pag', 'pam', 'pan', 'pao', 'pap', 'pau', 'pbb', 'pbc', 'pbi', 'pce', 'pcm', 'peg', 'pez', 'pib', 'pil', 'pir', 'pis', 'pjt', 'pkb', 'pls', 'plw', 'pmf', 'pny', 'poh-dialect_eastern', 'poh-dialect_western', 'poi', 'pol', 'por', 'poy', 'ppk', 'pps', 'prf', 'prk', 'prt', 'pse', 'pss', 'ptu', 'pui', 'pus', 'pwg', 'pww', 'pxm', 'qub', 'quc-dialect_central', 'quc-dialect_east', 'quc-dialect_north', 'quf', 'quh', 'qul', 'quw', 'quy', 'quz', 'qvc', 'qve', 'qvh', 'qvm', 'qvn', 'qvo', 'qvs', 'qvw', 'qvz', 'qwh', 'qxh', 'qxl', 'qxn', 'qxo', 'qxr', 'rah', 'rai', 'rap', 'rav', 'raw', 'rej', 'rel', 'rgu', 'rhg', 'rif-script_arabic', 'rif-script_latin', 'ril', 'rim', 'rjs', 'rkt', 'rmc-script_cyrillic', 'rmc-script_latin', 'rmo', 'rmy-script_cyrillic', 'rmy-script_latin', 'rng', 'rnl', 'roh-dialect_sursilv', 'roh-dialect_vallader', 'rol', 'ron', 'rop', 'rro', 'rub', 'ruf', 'rug', 'run', 'rus', 'sab', 'sag', 'sah', 'saj', 'saq', 'sas', 'sat', 'sba', 'sbd', 'sbl', 'sbp', 'sch', 'sck', 'sda', 'sea', 'seh', 'ses', 'sey', 'sgb', 'sgj', 'sgw', 'shi', 'shk', 'shn', 'sho', 'shp', 'sid', 'sig', 'sil', 'sja', 'sjm', 'sld', 'slk', 'slu', 'slv', 'sml', 'smo', 'sna', 'snd', 'sne', 'snn', 'snp', 'snw', 'som', 'soy', 'spa', 'spp', 'spy', 'sqi', 'sri', 'srm', 'srn', 'srp-script_cyrillic', 'srp-script_latin', 'srx', 'stn', 'stp', 'suc', 'suk', 'sun', 'sur', 'sus', 'suv', 'suz', 'swe', 'swh', 'sxb', 'sxn', 'sya', 'syl', 'sza', 'tac', 'taj', 'tam', 'tao', 'tap', 'taq', 'tat', 'tav', 'tbc', 'tbg', 'tbk', 'tbl', 'tby', 'tbz', 'tca', 'tcc', 'tcs', 'tcz', 'tdj', 'ted', 'tee', 'tel', 'tem', 'teo', 'ter', 'tes', 'tew', 'tex', 'tfr', 'tgj', 'tgk', 'tgl', 'tgo', 'tgp', 'tha', 'thk', 'thl', 'tih', 'tik', 'tir', 'tkr', 'tlb', 'tlj', 'tly', 'tmc', 'tmf', 'tna', 'tng', 'tnk', 'tnn', 'tnp', 'tnr', 'tnt', 'tob', 'toc', 'toh', 'tom', 'tos', 'tpi', 'tpm', 'tpp', 'tpt', 'trc', 'tri', 'trn', 'trs', 'tso', 'tsz', 'ttc', 'tte', 'ttq-script_tifinagh', 'tue', 'tuf', 'tuk-script_arabic', 'tuk-script_latin', 'tuo', 'tur', 'tvw', 'twb', 'twe', 'twu', 'txa', 'txq', 'txu', 'tye', 'tzh-dialect_bachajon', 'tzh-dialect_tenejapa', 'tzj-dialect_eastern', 'tzj-dialect_western', 'tzo-dialect_chamula', 'tzo-dialect_chenalho', 'ubl', 'ubu', 'udm', 'udu', 'uig-script_arabic', 'uig-script_cyrillic', 'ukr', 'umb', 'unr', 'upv', 'ura', 'urb', 'urd-script_arabic', 'urd-script_devanagari', 'urd-script_latin', 'urk', 'urt', 'ury', 'usp', 'uzb-script_cyrillic', 'uzb-script_latin', 'vag', 'vid', 'vie', 'vif', 'vmw', 'vmy', 'vot', 'vun', 'vut', 'wal-script_ethiopic', 'wal-script_latin', 'wap', 'war', 'waw', 'way', 'wba', 'wlo', 'wlx', 'wmw', 'wob', 'wol', 'wsg', 'wwa', 'xal', 'xdy', 'xed', 'xer', 'xho', 'xmm', 'xnj', 'xnr', 'xog', 'xon', 'xrb', 'xsb', 'xsm', 'xsr', 'xsu', 'xta', 'xtd', 'xte', 'xtm', 'xtn', 'xua', 'xuo', 'yaa', 'yad', 'yal', 'yam', 'yao', 'yas', 'yat', 'yaz', 'yba', 'ybb', 'ycl', 'ycn', 'yea', 'yka', 'yli', 'yor', 'yre', 'yua', 'yue-script_traditional', 'yuz', 'yva', 'zaa', 'zab', 'zac', 'zad', 'zae', 'zai', 'zam', 'zao', 'zaq', 'zar', 'zas', 'zav', 'zaw', 'zca', 'zga', 'zim', 'ziw', 'zlm', 'zmz', 'zne', 'zos', 'zpc', 'zpg', 'zpi', 'zpl', 'zpm', 'zpo', 'zpt', 'zpu', 'zpz', 'ztq', 'zty', 'zul', 'zyb', 'zyp', 'zza']) - - - -Switch out the language adapters by calling the ``load_adapter()`` -function for the model and ``set_target_lang()`` for the tokenizer. 
Pass -the target language as an input - ``"detect_language_id"`` which was -detected in the previous step. - -.. code:: ipython3 - - asr_processor.tokenizer.set_target_lang(language_id) - asr_model.load_adapter(language_id) - - -.. parsed-literal:: - - Ignored unknown kwarg option normalize - Ignored unknown kwarg option normalize - Ignored unknown kwarg option normalize - Ignored unknown kwarg option normalize - - -Use the original model for inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - inputs = asr_processor(example["audio"]["array"], sampling_rate=16_000, return_tensors="pt") - - with torch.no_grad(): - outputs = asr_model(**inputs).logits - - ids = torch.argmax(outputs, dim=-1)[0] - transcription = asr_processor.decode(ids) - print(transcription) - - -.. parsed-literal:: - - grisé par ce parfum il fit des vers en l'honneur de l'humble fleur des bois et il les récita tout haut à ses pieds une violette l'entendit elle crut qu'il ne parlait que pour elle - - -Convert to OpenVINO IR model and run inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Convert to OpenVINO IR model format with ``ov.convert_model`` function -directly. Use ``ov.save_model`` function to serialize the result of -conversion. For convenience of further use, we will create a function -for these purposes. - -.. code:: ipython3 - - asr_model_xml_path_template = "models/ov_asr_{}_model.xml" - - - def get_asr_model(model_path_template, language_id, compiled=True): - input_values = torch.zeros([1, MAX_SEQ_LENGTH], dtype=torch.float) - model_path = Path(model_path_template.format(language_id)) - - asr_processor.tokenizer.set_target_lang(language_id) - if not model_path.exists() and model_path_template == asr_model_xml_path_template: - asr_model.load_adapter(language_id) - - model_path.parent.mkdir(parents=True, exist_ok=True) - converted_model = ov.convert_model(asr_model, example_input={"input_values": input_values}) - ov.save_model(converted_model, model_path) - if not compiled: - return converted_model - - if compiled: - return core.compile_model(model_path, device_name=device.value) - return core.read_model(model_path) - - - compiled_asr_model = get_asr_model(asr_model_xml_path_template, language_id) - -Run inference. - -.. code:: ipython3 - - def recognize_audio(compiled_model, src_audio): - inputs = asr_processor(src_audio, sampling_rate=16_000, return_tensors="pt") - outputs = compiled_model(inputs["input_values"])[0] - - ids = torch.argmax(torch.from_numpy(outputs), dim=-1)[0] - transcription = asr_processor.decode(ids) - - return transcription - - - transcription = recognize_audio(compiled_asr_model, example["audio"]["array"]) - print("Original text:", example["transcript"]) - print("Transcription:", transcription) - - -.. parsed-literal:: - - Original text: grisé par ce parfum il fit des vers en l'honneur de l'humble fleur des bois et il les récita tout haut à ses pieds une violette l'entendit elle crut qu'il ne parlait que pour elle - Transcription: grisé par ce parfum il fit des vers en l'honneur de l'humble fleur des bois et il les récita tout haut à ses pieds une violette l'entendit elle crut qu'il ne parlait que pour elle - - -Quantization ------------- - - - -`NNCF `__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. 
Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -The optimization process contains the following steps: - -1. Create a calibration dataset for quantization. -2. Run ``nncf.quantize()`` to obtain quantized models. -3. Serialize quantized ``INT8`` model using ``openvino.save_model()`` - function. - -.. - - Note: Quantization is time and memory consuming operation. Running - quantization code below may take some time. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - compiled_quantized_lid_model = None - quantized_asr_model_xml_path_template = None - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Let’s load skip magic extension to skip quantization if to_quantize is -not selected - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - import requests - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -Preparing calibration dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Select the language to quantize the model for: - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from IPython.display import display - - display(SAMPLE_LANG) - -Load dev split of the same -`MLS `__ -dataset for the selected language. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - mls_dataset = iter(load_dataset("facebook/multilingual_librispeech", SAMPLE_LANG.value, split="dev", streaming=True, trust_remote_code=True)) - example = next(mls_dataset) - -Create calibration dataset for quantization. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - CALIBRATION_DATASET_SIZE = 5 - - calibration_data = [] - for i in range(CALIBRATION_DATASET_SIZE): - data = asr_processor(next(mls_dataset)['audio']['array'], sampling_rate=16_000, return_tensors="np") - calibration_data.append(data["input_values"]) - -Language identification model quantization -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Run LID model quantization. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - - quantized_lid_model_xml_path = Path(str(lid_model_xml_path).replace(".xml", "_quantized.xml")) - - if not quantized_lid_model_xml_path.exists(): - quantized_lid_model = nncf.quantize( - get_lid_model(lid_model_xml_path, compiled=False), - calibration_dataset=nncf.Dataset(calibration_data), - subset_size=len(calibration_data), - model_type=nncf.ModelType.TRANSFORMER - ) - ov.save_model(quantized_lid_model, quantized_lid_model_xml_path) - compiled_quantized_lid_model = core.compile_model(quantized_lid_model, device_name=device.value) - else: - compiled_quantized_lid_model = get_lid_model(quantized_lid_model_xml_path) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, openvino - - -.. parsed-literal:: - - Statistics collection: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00, 1.24s/it] - Applying Smooth Quant: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 291/291 [00:18<00:00, 15.34it/s] - - -.. parsed-literal:: - - INFO:nncf:144 ignored nodes was found by name in the NNCFGraph - - -.. 
parsed-literal:: - - Statistics collection: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:18<00:00, 3.65s/it] - Applying Fast Bias correction: 100%|██████████████████████████████████████████████████████████████████████████████████████| 298/298 [05:09<00:00, 1.04s/it] - - -Detect language with the quantized model. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - language_id = detect_language(compiled_quantized_lid_model, example['audio']['array']) - print("Detected language:", language_id) - - -.. parsed-literal:: - - Detected language: fra - - -Speech recognition model quantization -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Run ASR model quantization. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - quantized_asr_model_xml_path_template = asr_model_xml_path_template.replace(".xml", "_quantized.xml") - quantized_asr_model_xml_path = Path(quantized_asr_model_xml_path_template.format(language_id)) - - if not quantized_asr_model_xml_path.exists(): - quantized_asr_model = nncf.quantize( - get_asr_model(asr_model_xml_path_template, language_id, compiled=False), - calibration_dataset=nncf.Dataset(calibration_data), - subset_size=len(calibration_data), - model_type=nncf.ModelType.TRANSFORMER - ) - ov.save_model(quantized_asr_model, quantized_asr_model_xml_path) - compiled_quantized_asr_model = core.compile_model(quantized_asr_model, device_name=device.value) - else: - compiled_quantized_asr_model = get_asr_model(quantized_asr_model_xml_path_template, language_id) - - -.. parsed-literal:: - - Ignored unknown kwarg option normalize - Ignored unknown kwarg option normalize - Ignored unknown kwarg option normalize - Ignored unknown kwarg option normalize - - -.. parsed-literal:: - - Statistics collection: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00, 1.17s/it] - Applying Smooth Quant: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 290/290 [00:17<00:00, 16.39it/s] - - -.. parsed-literal:: - - INFO:nncf:144 ignored nodes was found by name in the NNCFGraph - - -.. parsed-literal:: - - Statistics collection: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:19<00:00, 3.93s/it] - Applying Fast Bias correction: 100%|██████████████████████████████████████████████████████████████████████████████████████| 393/393 [05:22<00:00, 1.22it/s] - - -Run transcription with quantized model and compare the result to the one -produced by original model. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - compiled_asr_model = get_asr_model(asr_model_xml_path_template, language_id) - transcription_original = recognize_audio(compiled_asr_model, example['audio']['array']) - transcription_quantized = recognize_audio(compiled_quantized_asr_model, example['audio']['array']) - print("Transcription by original model: ", transcription_original) - print("Transcription by quantized model:", transcription_quantized) - - -.. 
parsed-literal:: - - Ignored unknown kwarg option normalize - Ignored unknown kwarg option normalize - Ignored unknown kwarg option normalize - Ignored unknown kwarg option normalize - Transcription by original model: le salon était de la plus haute magnificence dorée comme la galerie de diane aux tuileries avec des tableaux à l'huile au lombri il y avait des tâches claires dans ces tableaux julien apprit plus tard que les sujets avaient semblé peu décent à la maîtresse du logis qui avait fait corriger les tableaux - Transcription by quantized model: le salon était de la plus haute magnificence doré comme la galerie de diane aux tuileries avec des tableaux à l'huile au lombri il y avait des tâches claires dans ces tableaux julien apprit plus tard que les sujets avaient semblé peu decent à la maîtresse du logis qui avait fait corriger les tableaux - - -Compare model size, performance and accuracy -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -First we compare model size. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - def calculate_compression_rate(model_path_ov, model_path_ov_int8, model_type): - model_size_fp32 = model_path_ov.with_suffix(".bin").stat().st_size / 10 ** 6 - model_size_int8 = model_path_ov_int8.with_suffix(".bin").stat().st_size / 10 ** 6 - print(f"{model_type} model footprint comparison:") - print(f" * FP32 IR model size: {model_size_fp32:.2f} MB") - print(f" * INT8 IR model size: {model_size_int8:.2f} MB") - return model_size_fp32, model_size_int8 - - lid_model_size_fp32, lid_model_size_int8 = \ - calculate_compression_rate(lid_model_xml_path, quantized_lid_model_xml_path, 'LID') - asr_model_size_fp32, asr_model_size_int8 = \ - calculate_compression_rate(Path(asr_model_xml_path_template.format(language_id)), quantized_asr_model_xml_path, 'ASR') - - -.. parsed-literal:: - - LID model footprint comparison: - * FP32 IR model size: 1931.81 MB - * INT8 IR model size: 968.96 MB - ASR model footprint comparison: - * FP32 IR model size: 1930.10 MB - * INT8 IR model size: 968.29 MB - - -Secondly we compare accuracy values of the original and quantized models -on a test split of MLS dataset. We rely on the Word Error Rate (WER) -metric and compute accuracy as ``(1 - WER)``. - -We also measure inference time for both language identification and -speech recognition models. - -.. 
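To make the metric concrete, here is a small, self-contained illustration (the two strings are made-up examples, not outputs of this notebook): ``jiwer.wer`` returns the fraction of word-level substitutions, insertions and deletions needed to turn a prediction into the reference.

.. code:: ipython3

    from jiwer import wer

    reference = "the cat sat on the mat"
    prediction = "the cat sat on a mat"

    error_rate = wer(reference, prediction)  # 1 substitution / 6 reference words ~ 0.17
    print(f"WER: {error_rate:.2f}, word accuracy: {1 - error_rate:.2f}")

The cell below applies the same idea to the whole test split while also timing both models.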
code:: ipython3 - - %%skip not $to_quantize.value - - import time - from tqdm.notebook import tqdm - import numpy as np - from jiwer import wer - - TEST_DATASET_SIZE = 20 - test_dataset = load_dataset("facebook/multilingual_librispeech", SAMPLE_LANG.value, split="test", streaming=True, trust_remote_code=True) - test_dataset = test_dataset.take(TEST_DATASET_SIZE) - - def calculate_transcription_time_and_accuracy(lid_model, asr_model): - ground_truths = [] - predictions = [] - identification_time = [] - transcription_time = [] - for data_item in tqdm(test_dataset, desc="Measuring performance and accuracy", total=TEST_DATASET_SIZE): - audio = data_item["audio"]["array"] - - start_time = time.perf_counter() - detect_language(lid_model, audio) - end_time = time.perf_counter() - identification_time.append(end_time - start_time) - - start_time = time.perf_counter() - transcription = recognize_audio(asr_model, audio) - end_time = time.perf_counter() - transcription_time.append(end_time - start_time) - - ground_truths.append(data_item["transcript"]) - predictions.append(transcription) - - word_accuracy = (1 - wer(ground_truths, predictions)) * 100 - mean_identification_time = np.mean(identification_time) - mean_transcription_time = np.mean(transcription_time) - return mean_identification_time, mean_transcription_time, word_accuracy - - identification_time_fp32, transcription_time_fp32, accuracy_fp32 = \ - calculate_transcription_time_and_accuracy(compiled_lid_model, compiled_asr_model) - identification_time_int8, transcription_time_int8, accuracy_int8 = \ - calculate_transcription_time_and_accuracy(compiled_quantized_lid_model, compiled_quantized_asr_model) - print(f"LID model footprint reduction: {lid_model_size_fp32 / lid_model_size_int8:.3f}") - print(f"ASR model footprint reduction: {asr_model_size_fp32 / asr_model_size_int8:.3f}") - print(f"Language identification performance speedup: {identification_time_fp32 / identification_time_int8:.3f}") - print(f"Language transcription performance speedup: {transcription_time_fp32 / transcription_time_int8:.3f}") - print(f"Transcription word accuracy. FP32: {accuracy_fp32:.2f}%. INT8: {accuracy_int8:.2f}%. Accuracy drop :{accuracy_fp32 - accuracy_int8:.2f}%.") - - - -.. parsed-literal:: - - Measuring performance and accuracy: 0%| | 0/20 [00:00`__ models provide the means -through which you can implement a semantic search engine with a few -dozen lines of code. The CLIP model has been trained on millions of -pairs of text and images, encoding semantics from images and text -combined. Using CLIP, you can provide a text query and CLIP will return -the images most related to the query. - -In this tutorial, we consider how to use MobileCLIP to implement a -visual content search engine for finding relevant frames in video. 
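The toy sketch below illustrates the mechanism this tutorial builds on: the text query and every image (or video frame) are mapped into a shared embedding space, the embeddings are L2-normalized, and frames are ranked by their dot product (cosine similarity) with the query. The vectors here are random placeholders rather than real model outputs; the real encoders are introduced later in the tutorial.

.. code:: ipython3

    import numpy as np

    rng = np.random.default_rng(0)
    frame_embeddings = rng.normal(size=(5, 512))  # stand-in for image-encoder outputs
    query_embedding = rng.normal(size=(512,))     # stand-in for the text-encoder output

    # After normalization the dot product equals cosine similarity
    frame_embeddings /= np.linalg.norm(frame_embeddings, axis=1, keepdims=True)
    query_embedding /= np.linalg.norm(query_embedding)

    similarity = frame_embeddings @ query_embedding
    print("Most relevant frame index:", int(np.argmax(similarity)))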
#### -Table of contents: - -- `Prerequisites <#prerequisites>`__ -- `Select model <#select-model>`__ -- `Run model inference <#run-model-inference>`__ - - - `Prepare image gallery <#prepare-image-gallery>`__ - - `Prepare model <#prepare-model>`__ - - `Perform search <#perform-search>`__ - -- `Convert Model to OpenVINO Intermediate Representation - format <#convert-model-to-openvino-intermediate-representation-format>`__ -- `Run OpenVINO model inference <#run-openvino-model-inference>`__ - - - `Select device for image - encoder <#select-device-for-image-encoder>`__ - - `Select device for text - encoder <#select-device-for-text-encoder>`__ - - `Perform search <#perform-search>`__ - -- `Interactive Demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - from pathlib import Path - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("mobileclip-video-search.ipynb") - -.. code:: ipython3 - - from cmd_helper import clone_repo - - - clone_repo("https://github.com/apple/ml-mobileclip.git") - -.. code:: ipython3 - - %pip install -q "./ml-mobileclip" --no-deps - - %pip install -q "clip-benchmark>=1.4.0" "datasets>=2.8.0" "open-clip-torch>=2.20.0" "timm>=0.9.5" "torch>=2.5.0" "torchvision>=0.20.0" --extra-index-url https://download.pytorch.org/whl/cpu - - %pip install -q "matplotlib>=3.4" "Pillow" "altair" "pandas" "tqdm" "opencv-python>=4.9" "gradio>=4.19" - %pip install -q "--no-deps salesforce-lavis==1.0.2" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "contexttimer" "einops>=0.4.1" decord omegaconf sentencepiece timm spacy --extra-index-url https://download.pytorch.org/whl/cpu - -.. code:: ipython3 - - %pip install -q "openvino>=2024.5.0" - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" "transformers>=4.45" "tokenizers>=0.20" --extra-index-url https://download.pytorch.org/whl/cpu - -Select model ------------- - - - -For starting work, we should select model that will be used in our -demonstration. By default, we will use the MobileCLIP model, but for -comparison purposes, you can select different models among: - -- **CLIP** - CLIP (Contrastive Language-Image Pre-Training) is a neural - network trained on various (image, text) pairs. It can be instructed - in natural language to predict the most relevant text snippet, given - an image, without directly optimizing for the task. CLIP uses a - `ViT `__ like transformer to get - visual features and a causal language model to get the text features. - The text and visual features are then projected into a latent space - with identical dimensions. 
The dot product between the projected - image and text features is then used as a similarity score. You can - find more information about this model in the `research - paper `__, `OpenAI - blog `__, `model - card `__ and - GitHub `repository `__. -- **SigLIP** - The SigLIP model was proposed in `Sigmoid Loss for - Language Image Pre-Training `__. - SigLIP proposes to replace the loss function used in - `CLIP `__ (Contrastive Language–Image - Pre-training) by a simple pairwise sigmoid loss. This results in - better performance in terms of zero-shot classification accuracy on - ImageNet. You can find more information about this model in the - `research paper `__ and `GitHub - repository `__, -- **MobileCLIP** - MobileCLIP – a new family of efficient image-text - models optimized for runtime performance along with a novel and - efficient training approach, namely multi-modal reinforced training. - The smallest variant MobileCLIP-S0 obtains similar zero-shot - performance as OpenAI’s CLIP ViT-b16 model while being several times - faster and 2.8x smaller. More details about model can be found in - `research paper `__ and `GitHub - repository `__. -- **BLIP-2** - BLIP2 was introduced in the paper `BLIP-2: Bootstrapping - Language-Image Pre-training with Frozen Image Encoders and Large - Language Models `__ by Li et - al. and first released in this - `repository `__. - It is a generic and efficient pre-training strategy that easily - harvests development of pretrained vision models and large language - models (LLMs) for vision-language pretraining. BLIP-2 consists of 3 - models: a CLIP-like image encoder, a Querying Transformer (Q-Former) - and a large language model. - -.. code:: ipython3 - - from pathlib import Path - - import ipywidgets as widgets - - - model_dir = Path("checkpoints") - - - def default_image_probs(image_features, text_features): - image_probs = (100.0 * text_features @ image_features.T).softmax(dim=-1) - return image_probs - - - def blip2_image_probs(image_features, text_features): - image_probs = image_features[:, 0, :] @ text_features[:, 0, :].t() - return image_probs - - - supported_models = { - "MobileCLIP": { - "mobileclip_s0": { - "model_name": "mobileclip_s0", - "pretrained": model_dir / "mobileclip_s0.pt", - "url": "https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_s0.pt", - "image_size": 256, - "image_probs": default_image_probs, - }, - "mobileclip_s1": { - "model_name": "mobileclip_s1", - "pretrained": model_dir / "mobileclip_s1.pt", - "url": "https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_s1.pt", - "image_size": 256, - "image_probs": default_image_probs, - }, - "mobileclip_s2": { - "model_name": "mobileclip_s0", - "pretrained": model_dir / "mobileclip_s2.pt", - "url": "https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_s2.pt", - "image_size": 256, - "image_probs": default_image_probs, - }, - "mobileclip_b": { - "model_name": "mobileclip_b", - "pretrained": model_dir / "mobileclip_b.pt", - "url": "https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_b.pt", - "image_size": 224, - "image_probs": default_image_probs, - }, - "mobileclip_blt": { - "model_name": "mobileclip_b", - "pretrained": model_dir / "mobileclip_blt.pt", - "url": "https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_blt.pt", - "image_size": 224, - "image_probs": default_image_probs, - }, - }, - "CLIP": { - "clip-vit-b-32": { - "model_name": 
"ViT-B-32", - "pretrained": "laion2b_s34b_b79k", - "image_size": 224, - "image_probs": default_image_probs, - }, - "clip-vit-b-16": { - "model_name": "ViT-B-16", - "pretrained": "openai", - "image_size": 224, - "image_probs": default_image_probs, - }, - "clip-vit-l-14": { - "model_name": "ViT-L-14", - "pretrained": "datacomp_xl_s13b_b90k", - "image_size": 224, - "image_probs": default_image_probs, - }, - "clip-vit-h-14": { - "model_name": "ViT-H-14", - "pretrained": "laion2b_s32b_b79k", - "image_size": 224, - "image_probs": default_image_probs, - }, - }, - "SigLIP": { - "siglip-vit-b-16": { - "model_name": "ViT-B-16-SigLIP", - "pretrained": "webli", - "image_size": 224, - "image_probs": default_image_probs, - }, - "siglip-vit-l-16": { - "model_name": "ViT-L-16-SigLIP-256", - "pretrained": "webli", - "image_size": 256, - "image_probs": default_image_probs, - }, - }, - "Blip2": { - "blip2_feature_extractor": { - "model_name": "blip2_feature_extractor", - "pretrained": "pretrain_vitL", - "image_size": 224, - "image_probs": blip2_image_probs, - }, - }, - } - - - model_type = widgets.Dropdown(options=supported_models.keys(), default="MobileCLIP", description="Model type:") - model_type - - - - -.. parsed-literal:: - - Dropdown(description='Model type:', options=('MobileCLIP', 'CLIP', 'SigLIP', 'Blip2'), value='MobileCLIP') - - - -.. code:: ipython3 - - available_models = supported_models[model_type.value] - - model_checkpoint = widgets.Dropdown( - options=available_models.keys(), - default=list(available_models), - description="Model:", - ) - - model_checkpoint - - - - -.. parsed-literal:: - - Dropdown(description='Model:', options=('blip2_feature_extractor',), value='blip2_feature_extractor') - - - -.. code:: ipython3 - - from notebook_utils import download_file, device_widget - - model_config = available_models[model_checkpoint.value] - -Run model inference -------------------- - - - -Now, let’s see model in action. We will try to find image, where some -specific object is represented using embeddings. Embeddings are a -numeric representation of data such as text and images. The model -learned to encode semantics about the contents of images in embedding -format. This ability turns the model into a powerful for solving various -tasks including image-text retrieval. To reach our goal we should: - -1. Calculate embeddings for all of the images in our dataset; -2. Calculate a text embedding for a user query (i.e. “black dog” or - “car”); -3. Compare the text embedding to the image embeddings to find related - embeddings. - -The closer two embeddings are, the more similar the contents they -represent are. - -Prepare image gallery -~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - from typing import List - import matplotlib.pyplot as plt - import numpy as np - from PIL import Image - - - def visualize_result(images: List, query: str = "", selected: List[int] = None): - """ - Utility function for visualizing classification results - params: - images (List[Image]) - list of images for visualization - query (str) - title for visualization - selected (List[int]) - list of selected image indices from images - returns: - matplotlib.Figure - """ - figsize = (20, 5) - fig, axs = plt.subplots(1, 4, figsize=figsize, sharex="all", sharey="all") - fig.patch.set_facecolor("white") - list_axes = list(axs.flat) - if query: - fig.suptitle(query, fontsize=20) - for idx, a in enumerate(list_axes): - a.set_xticklabels([]) - a.set_yticklabels([]) - a.get_xaxis().set_visible(False) - a.get_yaxis().set_visible(False) - a.grid(False) - a.imshow(images[idx]) - if selected is not None and idx not in selected: - mask = np.ones_like(np.array(images[idx])) - a.imshow(mask, "jet", interpolation="none", alpha=0.75) - return fig - - - images_urls = [ - "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/282ce53e-912d-41aa-ab48-2a001c022d74", - "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/9bb40168-82b5-4b11-ada6-d8df104c736c", - "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/0747b6db-12c3-4252-9a6a-057dcf8f3d4e", - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bricks.png", - ] - image_names = ["red_panda.png", "cat.png", "raccoon.png", "dog.png"] - sample_path = Path("data") - sample_path.mkdir(parents=True, exist_ok=True) - - images = [] - for image_name, image_url in zip(image_names, images_urls): - image_path = sample_path / image_name - if not image_path.exists(): - download_file(image_url, filename=image_name, directory=sample_path) - images.append(Image.open(image_path).convert("RGB").resize((640, 420))) - - input_labels = ["cat"] - text_descriptions = [f"This is a photo of a {label}" for label in input_labels] - - visualize_result(images, "image gallery"); - - - -.. image:: mobileclip-video-search-with-output_files/mobileclip-video-search-with-output_12_0.png - - -Prepare model -~~~~~~~~~~~~~ - - - -The code below downloads the model weights, creates the model class instance and -prepares the preprocessing utilities. - -.. 
code:: ipython3 - - import torch - - - class Blip2Model(torch.nn.Module): - def __init__(self, ln_vision, visual_encoder, query_tokens, q_former, vision_proj, text_proj, tokenizer): - super().__init__() - self.ln_vision = ln_vision - self.visual_encoder = visual_encoder - self.query_tokens = query_tokens - self.q_former = q_former - self.vision_proj = vision_proj - self.text_proj = text_proj - self.tok = tokenizer - - def encode_image(self, image): - image_embeds_frozen = self.ln_vision(self.visual_encoder(image)) - image_embeds_frozen = image_embeds_frozen.float() - image_atts = torch.ones(image_embeds_frozen.size()[:-1], dtype=torch.long) - query_tokens = self.query_tokens.expand(image_embeds_frozen.shape[0], -1, -1) - - query_output = self.q_former.bert( - query_embeds=query_tokens, - encoder_hidden_states=image_embeds_frozen, - encoder_attention_mask=image_atts, - return_dict=True, - ) - image_embeds = query_output.last_hidden_state - image_features = self.vision_proj(image_embeds) - - return image_features - - def encode_text(self, input_ids, attention_mask): - text_output = self.q_former.bert( - input_ids, - attention_mask=attention_mask, - return_dict=True, - ) - text_embeds = text_output.last_hidden_state - text_features = self.text_proj(text_embeds) - return text_features - - def tokenizer(self, text_descriptions): - input_ids = self.tok(text_descriptions, return_tensors="pt", padding=True).input_ids - attention_mask = self.tok(text_descriptions, return_tensors="pt", padding=True).attention_mask - text = {"input_ids": input_ids, "attention_mask": attention_mask} - return text - -.. code:: ipython3 - - import torch - import time - import mobileclip - import open_clip - - # instantiate model - model_name = model_config["model_name"] - pretrained = model_config["pretrained"] - - if model_type.value == "MobileCLIP": - model_dir.mkdir(exist_ok=True) - model_url = model_config["url"] - download_file(model_url, directory=model_dir) - model, _, preprocess = mobileclip.create_model_and_transforms(model_name, pretrained=pretrained) - tokenizer = mobileclip.get_tokenizer(model_name) - elif model_type.value == "Blip2": - from lavis.models import load_model_and_preprocess - - model, vis_processors, txt_processors = load_model_and_preprocess(name=model_name, model_type=pretrained, is_eval=True) - model = Blip2Model(model.ln_vision, model.visual_encoder, model.query_tokens, model.Qformer, model.vision_proj, model.text_proj, model.tokenizer) - preprocess = vis_processors["eval"] - tokenizer = model.tokenizer - else: - model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained) - tokenizer = open_clip.get_tokenizer(model_name) - -Perform search -~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - image_tensor = torch.stack([preprocess(image) for image in images]) - text = tokenizer(text_descriptions) - image_probs_function = model_config["image_probs"] - - with torch.no_grad(): - # calculate image embeddings - image_encoding_start = time.perf_counter() - image_features = model.encode_image(image_tensor) - image_encoding_end = time.perf_counter() - print(f"Image encoding took {image_encoding_end - image_encoding_start:.3} ms") - # calculate text embeddings - text_encoding_start = time.perf_counter() - text_features = model.encode_text(**text) if model_type.value == "Blip2" else model.encode_text(text) - text_encoding_end = time.perf_counter() - print(f"Text encoding took {text_encoding_end - text_encoding_start:.3} ms") - - image_features /= image_features.norm(dim=-1, keepdim=True) - text_features /= text_features.norm(dim=-1, keepdim=True) - image_probs = image_probs_function(image_features, text_features) - selected_image = [torch.argmax(image_probs).item()] - - visualize_result(images, input_labels[0], selected_image); - - -.. parsed-literal:: - - Image encoding took 0.731 ms - Text encoding took 0.0229 ms - - - -.. image:: mobileclip-video-search-with-output_files/mobileclip-video-search-with-output_17_1.png - - -Convert Model to OpenVINO Intermediate Representation format ------------------------------------------------------------- - - - -For best results with OpenVINO, it is recommended to convert the model -to OpenVINO IR format. OpenVINO supports PyTorch via Model conversion -API. To convert the PyTorch model to OpenVINO IR format we will use -``ov.convert_model`` of `model conversion -API `__. -The ``ov.convert_model`` Python function returns an OpenVINO Model -object ready to load on the device and start making predictions. - -Our model consist from 2 parts - image encoder and text encoder that can -be used separately. Let’s convert each part to OpenVINO. - -.. code:: ipython3 - - import types - import torch.nn.functional as F - - - def se_block_forward(self, inputs): - """Apply forward pass.""" - b, c, h, w = inputs.size() - x = F.avg_pool2d(inputs, kernel_size=[8, 8]) - x = self.reduce(x) - x = F.relu(x) - x = self.expand(x) - x = torch.sigmoid(x) - x = x.view(-1, c, 1, 1) - return inputs * x - -.. 
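The helper above is used to patch the squeeze-and-excitation block of the MobileCLIP-S image encoders before tracing; the original block appears to derive its pooling kernel from the runtime feature-map size, which is inconvenient for exporting a static graph. Assuming the block receives an 8x8 feature map at this point (which would explain the hard-coded kernel), the fixed-kernel pooling is numerically equivalent to global average pooling, as this small sanity check illustrates:

.. code:: ipython3

    import torch
    import torch.nn.functional as F

    x = torch.randn(1, 32, 8, 8)                       # dummy activation with an 8x8 spatial size
    fixed = F.avg_pool2d(x, kernel_size=[8, 8])        # what se_block_forward computes
    adaptive = F.adaptive_avg_pool2d(x, 1)             # global average pooling
    print(torch.allclose(fixed, adaptive, atol=1e-6))  # True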
code:: ipython3 - - import openvino as ov - import gc - - ov_models_dir = Path("ov_models") - ov_models_dir.mkdir(exist_ok=True) - - image_encoder_path = ov_models_dir / f"{model_checkpoint.value}_im_encoder.xml" - - if not image_encoder_path.exists(): - if "mobileclip_s" in model_name: - model.image_encoder.model.conv_exp.se.forward = types.MethodType(se_block_forward, model.image_encoder.model.conv_exp.se) - model.forward = model.encode_image - ov_image_encoder = ov.convert_model( - model, - example_input=image_tensor, - input=[-1, 3, image_tensor.shape[2], image_tensor.shape[3]], - ) - ov.save_model(ov_image_encoder, image_encoder_path) - del ov_image_encoder - gc.collect() - - text_encoder_path = ov_models_dir / f"{model_checkpoint.value}_text_encoder.xml" - - if not text_encoder_path.exists(): - model.forward = model.encode_text - if model_type.value == "Blip2": - ov_text_encoder = ov.convert_model(model, example_input=text) - else: - ov_text_encoder = ov.convert_model(model, example_input=text, input=[-1, text.shape[1]]) - ov.save_model(ov_text_encoder, text_encoder_path) - del ov_text_encoder - gc.collect() - - del model - gc.collect(); - -Run OpenVINO model inference ----------------------------- - - - -Select device for image encoder -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - core = ov.Core() - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - ov_compiled_image_encoder = core.compile_model(image_encoder_path, device.value) - ov_compiled_image_encoder(image_tensor); - -Select device for text encoder -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - ov_compiled_text_encoder = core.compile_model(text_encoder_path, device.value) - ov_compiled_text_encoder(text); - -Perform search -~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - image_encoding_start = time.perf_counter() - image_features = torch.from_numpy(ov_compiled_image_encoder(image_tensor)[0]) - image_encoding_end = time.perf_counter() - print(f"Image encoding took {image_encoding_end - image_encoding_start:.3} ms") - text_encoding_start = time.perf_counter() - text_features = torch.from_numpy(ov_compiled_text_encoder(text)[0]) - text_encoding_end = time.perf_counter() - print(f"Text encoding took {text_encoding_end - text_encoding_start:.3} ms") - image_features /= image_features.norm(dim=-1, keepdim=True) - text_features /= text_features.norm(dim=-1, keepdim=True) - - image_probs = image_probs_function(image_features, text_features) - selected_image = [torch.argmax(image_probs).item()] - - visualize_result(images, input_labels[0], selected_image); - - -.. parsed-literal:: - - Image encoding took 0.202 ms - Text encoding took 0.0043 ms - - - -.. image:: mobileclip-video-search-with-output_files/mobileclip-video-search-with-output_28_1.png - - -(optional) Translation model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Since all text embedding models in this notebook natively supports input -in English only, we can insert a translation model in this pipeline to -support searching in Chinese. - -- **opus-mt-zh-en t** - This is a translation model developed by - Language Technology Research Group at the University of Helsinki. It - supports Chinese as source Language and English as target Language - `model card `__. - -.. 
code:: ipython3 - - from pathlib import Path - - cn2en_trans_model_path = "ov_models/cn2en_trans_model" - cn2en_trans_model_id = "Helsinki-NLP/opus-mt-zh-en" - - if not Path(cn2en_trans_model_path).exists(): - !optimum-cli export openvino --model {cn2en_trans_model_id} --task text2text-generation-with-past --trust-remote-code {cn2en_trans_model_path} - -.. code:: ipython3 - - from transformers import AutoTokenizer - from optimum.intel import OVModelForSeq2SeqLM - - tr_tokenizer = AutoTokenizer.from_pretrained(cn2en_trans_model_path) - tr_model = OVModelForSeq2SeqLM.from_pretrained(cn2en_trans_model_path) - - -.. parsed-literal:: - - /home2/ethan/intel/openvino_notebooks/openvino_env/lib/python3.10/site-packages/transformers/models/marian/tokenization_marian.py:175: UserWarning: Recommended: pip install sacremoses. - warnings.warn("Recommended: pip install sacremoses.") - Compiling the encoder to CPU ... - Compiling the decoder to CPU ... - Compiling the decoder to CPU ... - - -Interactive Demo ----------------- - - - -In this part, you can try different supported by tutorial models in -searching frames in the video by text query or image. Upload video and -provide text query or reference image for search and model will find the -most relevant frames according to provided query. You can also try -querying in Chinese, and translation model will be triggered -automatically for Chinese-to-English translation. Please note, different -models can require different optimal threshold for search. - -.. code:: ipython3 - - import altair as alt - import cv2 - import pandas as pd - import torch - from PIL import Image - from torch.utils.data import DataLoader, Dataset - from torchvision.transforms.functional import to_pil_image, to_tensor - from torchvision.transforms import ( - CenterCrop, - Compose, - InterpolationMode, - Resize, - ToTensor, - ) - from open_clip.transform import image_transform - from typing import Optional - - - current_device = device.value - current_model = image_encoder_path.name.split("_im_encoder")[0] - - available_converted_models = [model_file.name.split("_im_encoder")[0] for model_file in ov_models_dir.glob("*_im_encoder.xml")] - available_devices = list(core.available_devices) + ["AUTO"] - - if not (sample_path / "car-detection.mp4").exists(): - download_file( - "https://storage.openvinotoolkit.org/data/test_data/videos/car-detection.mp4", - directory=sample_path, - ) - if not Path("coco.mp4").exists(): - download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Coco%20Walking%20in%20Berkeley.mp4", - directory=sample_path, - filename="coco.mp4", - ) - - - def is_english(text): - for char in text: - if not char.isascii(): - return False - return True - - - def translate(text): - if tr_tokenizer: - t = tr_tokenizer(text, return_tensors="pt") - r = tr_model.generate(**t) - text = tr_tokenizer.decode(r[0][1:-1]) - return text - - - def get_preprocess_probs_tokenizer(model_name): - if "mobileclip" in model_name: - resolution = supported_models["MobileCLIP"][model_name]["image_size"] - resize_size = resolution - centercrop_size = resolution - aug_list = [ - Resize( - resize_size, - interpolation=InterpolationMode.BILINEAR, - ), - CenterCrop(centercrop_size), - ToTensor(), - ] - preprocess = Compose(aug_list) - tokenizer = mobileclip.get_tokenizer(supported_models["MobileCLIP"][model_name]["model_name"]) - image_probs = default_image_probs - elif "blip2" in model_name: - from lavis.models import load_model_and_preprocess - - model, 
vis_processors, txt_processors = load_model_and_preprocess(name=model_name, model_type=pretrained, is_eval=True) - model = Blip2Model(model.ln_vision, model.visual_encoder, model.query_tokens, model.Qformer, model.vision_proj, model.text_proj, model.tokenizer) - preprocess = vis_processors["eval"] - tokenizer = model.tokenizer - image_probs = blip2_image_probs - else: - model_configs = supported_models["SigLIP"] if "siglip" in model_name else supported_models["CLIP"] - resize_size = model_configs[model_name]["image_size"] - preprocess = image_transform((resize_size, resize_size), is_train=False, resize_mode="longest") - tokenizer = open_clip.get_tokenizer(model_configs[model_name]["model_name"]) - image_probs = default_image_probs - - return preprocess, image_probs, tokenizer - - - def run( - path: str, - text_search: str, - image_search: Optional[Image.Image], - model_name: str, - device: str, - thresh: float, - stride: int, - batch_size: int, - ): - assert path, "An input video should be provided" - assert text_search is not None or image_search is not None, "A text or image query should be provided" - global current_model - global current_device - global preprocess - global tokenizer - global ov_compiled_image_encoder - global ov_compiled_text_encoder - global image_probs_function - - if current_model != model_name or device != current_device: - ov_compiled_image_encoder = core.compile_model(ov_models_dir / f"{model_name}_im_encoder.xml", device) - ov_compiled_text_encoder = core.compile_model(ov_models_dir / f"{model_name}_text_encoder.xml", device) - preprocess, image_probs_function, tokenizer = get_preprocess_probs_tokenizer(model_name) - current_model = model_name - current_device = device - # Load video - dataset = LoadVideo(path, transforms=preprocess, vid_stride=stride) - dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=0) - - # Get image query features - if image_search: - image = preprocess(image_search).unsqueeze(0) - query_features = torch.from_numpy(ov_compiled_image_encoder(image)[0]) - query_features /= query_features.norm(dim=-1, keepdim=True) - # Get text query features - else: - if not is_english(text_search): - text_search = translate(text_search) - print(f"Translated input text: {text_search}") - # Tokenize search phrase - text = tokenizer([text_search]) - # Encode text query - query_features = torch.from_numpy(ov_compiled_text_encoder(text)[0]) - query_features /= query_features.norm(dim=-1, keepdim=True) - # Encode each frame and compare with query features - matches = [] - matches_probs = [] - res = pd.DataFrame(columns=["Frame", "Timestamp", "Similarity"]) - for image, orig, frame, timestamp in dataloader: - with torch.no_grad(): - image_features = torch.from_numpy(ov_compiled_image_encoder(image)[0]) - - image_features /= image_features.norm(dim=-1, keepdim=True) - probs = image_probs_function(image_features, query_features) - probs = probs.cpu().numpy().squeeze(1) if "blip2" in model_name else probs[0] - # Save frame similarity values - df = pd.DataFrame( - { - "Frame": frame.tolist(), - "Timestamp": torch.round(timestamp / 1000, decimals=2).tolist(), - "Similarity": probs.tolist(), - } - ) - res = pd.concat([res, df]) - - # Check if frame is over threshold - for i, p in enumerate(probs): - if p > thresh: - matches.append(to_pil_image(orig[i])) - matches_probs.append(p) - - print(f"Frames: {frame.tolist()} - Probs: {probs}") - - # Create plot of similarity values - lines = ( - alt.Chart(res) - .mark_line(color="firebrick") - 
.encode( - alt.X("Timestamp", title="Timestamp (seconds)"), - alt.Y("Similarity", scale=alt.Scale(zero=False)), - ) - ).properties(width=600) - rule = alt.Chart().mark_rule(strokeDash=[6, 3], size=2).encode(y=alt.datum(thresh)) - - selected_frames = np.argsort(-1 * np.array(matches_probs))[:20] - matched_sorted_frames = [matches[idx] for idx in selected_frames] - - return ( - lines + rule, - matched_sorted_frames, - ) # Only return up to 20 images to not crash the UI - - - class LoadVideo(Dataset): - def __init__(self, path, transforms, vid_stride=1): - self.transforms = transforms - self.vid_stride = vid_stride - self.cur_frame = 0 - self.cap = cv2.VideoCapture(path) - self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT) / self.vid_stride) - - def __getitem__(self, _): - # Read video - # Skip over frames - for _ in range(self.vid_stride): - self.cap.grab() - self.cur_frame += 1 - - # Read frame - _, img = self.cap.retrieve() - timestamp = self.cap.get(cv2.CAP_PROP_POS_MSEC) - - # Convert to PIL - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - img = Image.fromarray(np.uint8(img)) - - # Apply transforms - img_t = self.transforms(img) - - return img_t, to_tensor(img), self.cur_frame, timestamp - - def __len__(self): - return self.total_frames - - -.. parsed-literal:: - - 'data/car-detection.mp4' already exists. - 'data/coco.mp4' already exists. - - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/mobileclip-video-search/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo, Option - - demo = make_demo( - run=run, - model_option=Option(choices=available_converted_models, value=model_checkpoint.value), - device_option=Option(choices=available_devices, value=device.value), - ) - - try: - demo.launch(debug=True) - except Exception: - demo.launch(share=True, debug=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/mobileclip-video-search-with-output_files/mobileclip-video-search-with-output_12_0.png b/docs/notebooks/mobileclip-video-search-with-output_files/mobileclip-video-search-with-output_12_0.png deleted file mode 100644 index fd979c064620ab..00000000000000 --- a/docs/notebooks/mobileclip-video-search-with-output_files/mobileclip-video-search-with-output_12_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1da16a568017a5b7968aa0bba55d2d6b0880a564d9e072754d31c27cd591e075 -size 627462 diff --git a/docs/notebooks/mobileclip-video-search-with-output_files/mobileclip-video-search-with-output_17_1.png b/docs/notebooks/mobileclip-video-search-with-output_files/mobileclip-video-search-with-output_17_1.png deleted file mode 100644 index 1b72759105dca1..00000000000000 --- a/docs/notebooks/mobileclip-video-search-with-output_files/mobileclip-video-search-with-output_17_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0d2868bca6af3beb595a10db499185336eb13b7d547ada3408fcf7f071b47ebb -size 449871 diff --git a/docs/notebooks/mobileclip-video-search-with-output_files/mobileclip-video-search-with-output_28_1.png b/docs/notebooks/mobileclip-video-search-with-output_files/mobileclip-video-search-with-output_28_1.png deleted file mode 100644 index 
1b72759105dca1..00000000000000 --- a/docs/notebooks/mobileclip-video-search-with-output_files/mobileclip-video-search-with-output_28_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0d2868bca6af3beb595a10db499185336eb13b7d547ada3408fcf7f071b47ebb -size 449871 diff --git a/docs/notebooks/model-server-with-output.rst b/docs/notebooks/model-server-with-output.rst deleted file mode 100644 index 404e9711292ca6..00000000000000 --- a/docs/notebooks/model-server-with-output.rst +++ /dev/null @@ -1,960 +0,0 @@ -Hello Model Server -================== - -Introduction to OpenVINO™ Model Server (OVMS). - -What is Model Serving? ----------------------- - -A model server hosts models and makes them accessible to software -components over standard network protocols. A client sends a request to -the model server, which performs inference and sends a response back to -the client. Model serving offers many advantages for efficient model -deployment: - -- Remote inference enables using lightweight clients with only the - necessary functions to perform API calls to edge or cloud - deployments. -- Applications are independent of the model framework, hardware device, - and infrastructure. -- Client applications in any programming language that supports REST or - gRPC calls can be used to run inference remotely on the model server. -- Clients require fewer updates since client libraries change very - rarely. -- Model topology and weights are not exposed directly to client - applications, making it easier to control access to the model. -- Ideal architecture for microservices-based applications and - deployments in cloud environments – including Kubernetes and - OpenShift clusters. -- Efficient resource utilization with horizontal and vertical inference - scaling. - -.. figure:: https://user-images.githubusercontent.com/91237924/215658773-4720df00-3b95-4a84-85a2-40f06138e914.png - :alt: ovms_diagram - - ovms_diagram - - -**Table of contents:** - - -- `Serving with OpenVINO Model - Server <#serving-with-openvino-model-server>`__ -- `Step 1: Prepare Docker <#step-1-prepare-docker>`__ -- `Step 2: Preparing a Model - Repository <#step-2-preparing-a-model-repository>`__ -- `Step 3: Start the Model Server - Container <#step-3-start-the-model-server-container>`__ -- `Step 4: Prepare the Example Client - Components <#step-4-prepare-the-example-client-components>`__ - - - `Prerequisites <#prerequisites>`__ - - `Imports <#imports>`__ - - `Request Model Status <#request-model-status>`__ - - `Request Model Metadata <#request-model-metadata>`__ - - `Load input image <#load-input-image>`__ - - `Request Prediction on a Numpy - Array <#request-prediction-on-a-numpy-array>`__ - - `Visualization <#visualization>`__ - -- `References <#references>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Serving with OpenVINO Model Server ----------------------------------- - -OpenVINO Model Server (OVMS) is -a high-performance system for serving models. Implemented in C++ for -scalability and optimized for deployment on Intel architectures, the -model server uses the same architecture and API as TensorFlow Serving -and KServe while applying OpenVINO for inference execution. 
Inference -service is provided via gRPC or REST API, making deploying new -algorithms and AI experiments easy. - -.. figure:: https://user-images.githubusercontent.com/91237924/215658767-0e0fc221-aed0-4db1-9a82-6be55f244dba.png - :alt: ovms_high_level - - ovms_high_level - -To quickly start using OpenVINO™ Model Server, follow these steps: - -Step 1: Prepare Docker ----------------------- - -Install `Docker -Engine `__, including its -`post-installation `__ -steps, on your development system. To verify installation, test it, -using the following command. When it is ready, it will display a test -image and a message. - -.. code:: ipython3 - - !docker run hello-world - - -.. parsed-literal:: - - - Hello from Docker! - This message shows that your installation appears to be working correctly. - - To generate this message, Docker took the following steps: - 1. The Docker client contacted the Docker daemon. - 2. The Docker daemon pulled the "hello-world" image from the Docker Hub. - (amd64) - 3. The Docker daemon created a new container from that image which runs the - executable that produces the output you are currently reading. - 4. The Docker daemon streamed that output to the Docker client, which sent it - to your terminal. - - To try something more ambitious, you can run an Ubuntu container with: - $ docker run -it ubuntu bash - - Share images, automate workflows, and more with a free Docker ID: - https://hub.docker.com/ - - For more examples and ideas, visit: - https://docs.docker.com/get-started/ - - - -Step 2: Preparing a Model Repository ------------------------------------- - -The models need to be placed -and mounted in a particular directory structure and according to the -following rules: - -:: - - tree models/ - models/ - ├── model1 - │ ├── 1 - │ │ ├── ir_model.bin - │ │ └── ir_model.xml - │ └── 2 - │ ├── ir_model.bin - │ └── ir_model.xml - ├── model2 - │ └── 1 - │ ├── ir_model.bin - │ ├── ir_model.xml - │ └── mapping_config.json - ├── model3 - │ └── 1 - │ └── model.onnx - ├── model4 - │ └── 1 - │ ├── model.pdiparams - │ └── model.pdmodel - └── model5 - └── 1 - └── TF_fronzen_model.pb - -- Each model should be stored in a dedicated directory, for example, - model1 and model2. - -- Each model directory should include a sub-folder for each of its - versions (1,2, etc). The versions and their folder names should be - positive integer values. - -- Note that in execution, the versions are enabled according to a - pre-defined version policy. If the client does not specify the - version number in parameters, by default, the latest version is - served. - -- Every version folder must include model files, that is, ``.bin`` and - ``.xml`` for OpenVINO IR, ``.onnx`` for ONNX, ``.pdiparams`` and - ``.pdmodel`` for Paddle Paddle, and ``.pb`` for TensorFlow. The file - name can be arbitrary. - -.. code:: ipython3 - - %pip install -q "openvino>=2024.4.0" opencv-python tqdm "matplotlib>=3.4" - -.. 
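As a hypothetical illustration of the repository layout rules above (this helper is not part of the original notebook; the next cell prepares the actual ``models/detection/1`` folder used in this example), registering a new version of an OpenVINO IR model only requires creating the right folder and copying the ``.xml``/``.bin`` pair into it:

.. code:: ipython3

    from pathlib import Path
    import shutil


    def add_model_version(repo: Path, model_name: str, version: int, xml_path: Path) -> Path:
        """Copy an IR model (.xml plus .bin) into repo/model_name/version/."""
        assert version > 0, "Version folders must be positive integers"
        target = repo / model_name / str(version)
        target.mkdir(parents=True, exist_ok=True)
        shutil.copy(xml_path, target)
        shutil.copy(xml_path.with_suffix(".bin"), target)
        return target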
code:: ipython3 - - import os - - # Fetch `notebook_utils` module - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file - - dedicated_dir = Path("models") - model_name = "detection" - model_version = "1" - - MODEL_DIR = dedicated_dir / model_name / model_version - XML_PATH = "horizontal-text-detection-0001.xml" - BIN_PATH = "horizontal-text-detection-0001.bin" - os.makedirs(MODEL_DIR, exist_ok=True) - model_xml_url = ( - "https://storage.openvinotoolkit.org/repositories/open_model_zoo/2022.3/models_bin/1/horizontal-text-detection-0001/FP32/horizontal-text-detection-0001.xml" - ) - model_bin_url = ( - "https://storage.openvinotoolkit.org/repositories/open_model_zoo/2022.3/models_bin/1/horizontal-text-detection-0001/FP32/horizontal-text-detection-0001.bin" - ) - - if not (MODEL_DIR / XML_PATH).exists(): - download_file(model_xml_url, XML_PATH, MODEL_DIR) - download_file(model_bin_url, BIN_PATH, MODEL_DIR) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("model-server.ipynb") - - - -.. parsed-literal:: - - models/detection/1/horizontal-text-detection-0001.xml: 0%| | 0.00/680k [00:009000/tcp, :::37581->9000/tcp ovms - - -The required Model Server parameters are listed below. For additional -configuration options, see the `Model Server Parameters -section `__. - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - - - -.. raw:: html - -
The required Model Server parameters are listed below. For additional
configuration options, see the *Model Server Parameters* section of the
OpenVINO Model Server documentation.

- ``--rm``: remove the container automatically when it exits
- ``-d``: run the container in the background
- ``-v``: define how to mount the model folder in the Docker container
- ``-p``: expose the model serving port outside the Docker container
- ``openvino/model_server:latest``: the image name; the OVMS binary is the
  Docker entry point. It varies by tag and build process - see
  https://hub.docker.com/r/openvino/model_server/tags/ for a full tag list.
- ``--model_path``: the model location, which can be a Docker container path
  mounted during start-up, a Google Cloud Storage path (``gs://``), an AWS S3
  path (``s3://``), or an Azure blob path (``az://``)
- ``--model_name``: the name of the model in the ``model_path``
- ``--port``: the gRPC server port
- ``--rest_port``: the REST server port
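Before building a client, it may be worth confirming that the container
is running and that the model has finished loading; serving can take a
few seconds to become available after start-up. A quick check could look
like this (both are plain Docker commands; the container name ``ovms``
matches the one used when starting the server):

.. code:: ipython3

    # List the running "ovms" container, then show the last lines of the
    # server log, where OVMS reports the loading status of the model.
    !docker ps --filter name=ovms
    !docker logs ovms 2>&1 | tail -n 10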
- -If the serving port is already in use, please switch it to another -available port on your system. For example:\ ``-p 9020:9000`` - -Step 4: Prepare the Example Client Components ---------------------------------------------- - -OpenVINO Model Server exposes -two sets of APIs: one compatible with ``TensorFlow Serving`` and another -one, with ``KServe API``, for inference. Both APIs work on ``gRPC`` and -``REST``\ interfaces. Supporting two sets of APIs makes OpenVINO Model -Server easier to plug into existing systems the already leverage one of -these APIs for inference. This example will demonstrate how to write a -TensorFlow Serving API client for object detection. - -Prerequisites -~~~~~~~~~~~~~ - - - -Install necessary packages. - -.. code:: ipython3 - - %pip install -q ovmsclient - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -Imports -~~~~~~~ - - - -.. code:: ipython3 - - import cv2 - import numpy as np - import matplotlib.pyplot as plt - from ovmsclient import make_grpc_client - -Request Model Status -~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - address = "localhost:" + str(port) - - # Bind the grpc address to the client object - client = make_grpc_client(address) - model_status = client.get_model_status(model_name=model_name) - print(model_status) - - -.. parsed-literal:: - - {1: {'state': 'AVAILABLE', 'error_code': 0, 'error_message': 'OK'}} - - -Request Model Metadata -~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - model_metadata = client.get_model_metadata(model_name=model_name) - print(model_metadata) - - -.. parsed-literal:: - - {'model_version': 1, 'inputs': {'image': {'shape': [1, 3, 704, 704], 'dtype': 'DT_FLOAT'}}, 'outputs': {'boxes': {'shape': [-1, 5], 'dtype': 'DT_FLOAT'}, 'labels': {'shape': [-1], 'dtype': 'DT_INT64'}}} - - -Load input image -~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - if not Path("data/intel_rnb.jpg").exists(): - # Download the image from the openvino_notebooks storage - image_filename = download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/intel_rnb.jpg", - directory="data", - ) - - # Text detection models expect an image in BGR format. - image = cv2.imread(str(image_filename)) - fp_image = image.astype("float32") - - # Resize the image to meet network expected input sizes. - input_shape = model_metadata["inputs"]["image"]["shape"] - height, width = input_shape[2], input_shape[3] - resized_image = cv2.resize(fp_image, (height, width)) - - # Reshape to the network input shape. - input_image = np.expand_dims(resized_image.transpose(2, 0, 1), 0) - plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) - - - -.. parsed-literal:: - - data/intel_rnb.jpg: 0%| | 0.00/288k [00:00 - - - - -.. image:: model-server-with-output_files/model-server-with-output_23_2.png - - -Request Prediction on a Numpy Array -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - inputs = {"image": input_image} - - # Run inference on model server and receive the result data - boxes = client.predict(inputs=inputs, model_name=model_name)["boxes"] - - # Remove zero only boxes. - boxes = boxes[~np.all(boxes == 0, axis=1)] - print(boxes) - - -.. 
parsed-literal:: - - [[4.0075238e+02 8.1240105e+01 5.6262683e+02 1.3609659e+02 5.3646392e-01] - [2.6150497e+02 6.8225861e+01 3.8433078e+02 1.2111545e+02 4.7504124e-01] - [6.1611401e+02 2.8000638e+02 6.6605963e+02 3.1116574e+02 4.5030469e-01] - [2.0762566e+02 6.2619057e+01 2.3446707e+02 1.0711832e+02 3.7426147e-01] - [5.1753296e+02 5.5611102e+02 5.4918005e+02 5.8740009e+02 3.2477754e-01] - [2.2038467e+01 4.5390991e+01 1.8856328e+02 1.0215196e+02 2.9959568e-01]] - - -Visualization -~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # For each detection, the description is in the [x_min, y_min, x_max, y_max, conf] format: - # The image passed here is in BGR format with changed width and height. To display it in colors expected by matplotlib, use cvtColor function - def convert_result_to_image(bgr_image, resized_image, boxes, threshold=0.3, conf_labels=True): - # Define colors for boxes and descriptions. - colors = {"red": (255, 0, 0), "green": (0, 255, 0)} - - # Fetch the image shapes to calculate a ratio. - (real_y, real_x), (resized_y, resized_x) = ( - bgr_image.shape[:2], - resized_image.shape[:2], - ) - ratio_x, ratio_y = real_x / resized_x, real_y / resized_y - - # Convert the base image from BGR to RGB format. - rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB) - - # Iterate through non-zero boxes. - for box in boxes: - # Pick a confidence factor from the last place in an array. - conf = box[-1] - if conf > threshold: - # Convert float to int and multiply corner position of each box by x and y ratio. - # If the bounding box is found at the top of the image, - # position the upper box bar little lower to make it visible on the image. - (x_min, y_min, x_max, y_max) = [ - (int(max(corner_position * ratio_y, 10)) if idx % 2 else int(corner_position * ratio_x)) for idx, corner_position in enumerate(box[:-1]) - ] - - # Draw a box based on the position, parameters in rectangle function are: image, start_point, end_point, color, thickness. - rgb_image = cv2.rectangle(rgb_image, (x_min, y_min), (x_max, y_max), colors["green"], 3) - - # Add text to the image based on position and confidence. - # Parameters in text function are: image, text, bottom-left_corner_textfield, font, font_scale, color, thickness, line_type. - if conf_labels: - rgb_image = cv2.putText( - rgb_image, - f"{conf:.2f}", - (x_min, y_min - 10), - cv2.FONT_HERSHEY_SIMPLEX, - 0.8, - colors["red"], - 1, - cv2.LINE_AA, - ) - - return rgb_image - -.. code:: ipython3 - - plt.figure(figsize=(10, 6)) - plt.axis("off") - plt.imshow(convert_result_to_image(image, resized_image, boxes, conf_labels=False)) - - - - -.. parsed-literal:: - - - - - - -.. image:: model-server-with-output_files/model-server-with-output_28_1.png - - -To stop and remove the model server container, you can use the following -command: - -.. code:: ipython3 - - !docker stop ovms - - -.. parsed-literal:: - - ovms - - -References ----------- - - - -1. `OpenVINO™ Model Server - documentation `__ -2. 
`OpenVINO™ Model Server GitHub - repository `__ diff --git a/docs/notebooks/model-server-with-output_files/model-server-with-output_23_2.png b/docs/notebooks/model-server-with-output_files/model-server-with-output_23_2.png deleted file mode 100644 index 405f2240ea5aec..00000000000000 --- a/docs/notebooks/model-server-with-output_files/model-server-with-output_23_2.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:151f6c5d9e90a68411adce7359011fe6580a071decbc7f1cacbf60975083bc9d -size 305482 diff --git a/docs/notebooks/model-server-with-output_files/model-server-with-output_28_1.png b/docs/notebooks/model-server-with-output_files/model-server-with-output_28_1.png deleted file mode 100644 index 74a53ee7ee0da3..00000000000000 --- a/docs/notebooks/model-server-with-output_files/model-server-with-output_28_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c0dd02ecae6935c5d65ae1ad543ef6a918185c440b667e68b98f4bd681762030 -size 458294 diff --git a/docs/notebooks/modelscope-to-openvino-with-output.rst b/docs/notebooks/modelscope-to-openvino-with-output.rst deleted file mode 100644 index 6bdf0acb13012a..00000000000000 --- a/docs/notebooks/modelscope-to-openvino-with-output.rst +++ /dev/null @@ -1,565 +0,0 @@ -Convert models from ModelScope to OpenVINO -========================================== - -.. image:: https://camo.githubusercontent.com/bbda58b4f77b80d9206e3410b533ca5a2582b81070e7dd283ee12fd0d442bd2b/68747470733a2f2f6d6f64656c73636f70652e6f73732d636e2d6265696a696e672e616c6979756e63732e636f6d2f6d6f64656c73636f70652e676966 - -`ModelScope `__ is a -“Model-as-a-Service” (MaaS) platform that seeks to bring together most -advanced machine learning models from the AI community, and to -streamline the process of leveraging AI models in real applications. -Hundreds of models are made publicly available on ModelScope (700+ and -counting), covering the latest development in areas such as NLP, CV, -Audio, Multi-modality, and AI for Science, etc. Many of these models -represent the SOTA in their specific fields, and made their open-sourced -debut on ModelScope. - -This tutorial covers how to use the modelscope ecosystem within -OpenVINO. - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Convert models from ModelScope using OpenVINO Model Conversion - API <#convert-models-from-modelscope-using-openvino-model-conversion-api>`__ - - - `Select inference device for image - classification <#select-inference-device-for-image-classification>`__ - - `Run Image classification <#run-image-classification>`__ - -- `Convert ModelScope models using Optimum - Intel <#convert-modelscope-models-using-optimum-intel>`__ - - - `Select inference device for text - classification <#select-inference-device-for-text-classification>`__ - - `Perform text classification <#perform-text-classification>`__ - -- `Convert ModelScope models for usage with OpenVINO - GenAI <#convert-modelscope-models-for-usage-with-openvino-genai>`__ - - - `Select inference device for text - generation <#select-inference-device-for-text-generation>`__ - - `Run OpenVINO GenAI pipeline <#run-openvino-genai-pipeline>`__ - -Prerequisites -------------- - - - -.. 
code:: ipython3 - - import platform - - %pip install -q "torch>=2.1.1" "torchvision" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q modelscope addict oss2 simplejson sortedcontainers pillow opencv-python "datasets<=3.0.0" - %pip install -q "transformers>=4.45" "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -qU "openvino>=2024.5.0" "openvino-tokenizers>=2024.5.0" "openvino-genai>=2024.5.0" "nncf>=2.14.0" - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("modelscope-to-openvino.ipynb") - -Convert models from ModelScope using OpenVINO Model Conversion API ------------------------------------------------------------------- - - - -Modelscope package provides API for initializing a model and loading a -set of pre-trained weights using the model text handle. Discovering a -desired model name is straightforward with `Modelscope models web -page `__, one can choose a model -solving a particular machine learning problem and even sort the models -by popularity and novelty. - -OpenVINO supports various types of models and frameworks via conversion -to OpenVINO Intermediate Representation (IR). `OpenVINO model conversion -API `__ -should be used for these purposes. ``ov.convert_model`` function accepts -original model instance and example input for tracing and returns -``ov.Model`` representing this model in OpenVINO framework. Converted -model can be used for saving on disk using ``ov.save_model`` function or -directly loading on device using ``core.complie_model``. - -As example, we will use -`tinynas `__ -image classification model. The code bellow demonstrates how to load -this model using Modelscope pipelines interface, convert it to OpenVINO -IR and then perform image classification on specified device. - -.. code:: ipython3 - - from pathlib import Path - - from modelscope.pipelines import pipeline - from modelscope.utils.constant import Tasks - import openvino as ov - import torch - import gc - - - cls_model_id = "iic/cv_tinynas_classification" - cls_model_path = Path(cls_model_id.split("/")[-1]) / "openvino_model.xml" - - if not cls_model_path.exists(): - # load Modelcope pipeline with model - image_classification = pipeline(Tasks.image_classification, model=cls_model_id) - # convert model to OpenVINO - ov_model = ov.convert_model(image_classification.model, example_input=torch.zeros((1, 3, 224, 224)), input=[1, 3, 224, 224]) - # save OpenVINO model on disk for next usage - ov.save_model(ov_model, cls_model_path) - del ov_model - del image_classification - gc.collect(); - - -.. parsed-literal:: - - 2024-11-12 19:08:10.199148: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
- 2024-11-12 19:08:10.212253: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - WARNING: All log messages before absl::InitializeLog() is called are written to STDERR - E0000 00:00:1731424090.226654 1605757 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - E0000 00:00:1731424090.230976 1605757 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2024-11-12 19:08:10.246563: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - - -Select inference device for image classification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - cv_cls_device = device_widget("CPU") - - cv_cls_device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -Run Image classification -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Model inference interface remains compatible with pipeline preprocessing -and postprocessing, so you can reuse these part of pipeline, but for -providing standalone experience, we will demonstrate how to use model -without pipeline. The code bellow defines utilities for image -preprocessing and postprocessing. - -.. code:: ipython3 - - from notebook_utils import download_file - from PIL import Image - from torchvision import transforms - - # prepare input data and output lables - img_url = "https://pailitao-image-recog.oss-cn-zhangjiakou.aliyuncs.com/mufan/img_data/maas_test_data/dog.png" - img_path = Path("dog.png") - - labels_url = "https://raw.githubusercontent.com/openvinotoolkit/open_model_zoo/master/data/dataset_classes/imagenet_2012.txt" - - labels_path = Path("imagenet_2012.txt") - - if not img_path.exists(): - download_file(img_url) - - if not labels_path.exists(): - download_file(labels_url) - - image = Image.open(img_path) - imagenet_classes = labels_path.open("r").read().splitlines() - - - # prepare image preprocessing - transforms_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - transform_list = [ - transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms_normalize, - ] - transformer = transforms.Compose(transform_list) - - # compile model - core = ov.Core() - - ov_model = core.compile_model(cls_model_path, cv_cls_device.value) - -Now, when we make all necessary preparations, we can run model -inference. - -.. code:: ipython3 - - import numpy as np - - # preprocess input - image_tensor = transformer(image) - - # run model inference - result = ov_model(image_tensor.unsqueeze(0))[0] - - # postprocess results - label_id = np.argmax(result[0]) - score = result[0][label_id] - - label = imagenet_classes[label_id] - - # visualize results - display(image) - print(f"Predicted label: {label}, score {score}") - - - -.. image:: modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.png - - -.. 
parsed-literal:: - - Predicted label: n02099601 golden retriever, score 8.060977935791016 - - -Convert ModelScope models using Optimum Intel ---------------------------------------------- - - - -For models compatible with the `HuggingFace -Transformers `__ -library, we can use `Optimum -Intel `__ integration -to convert and run model. Optimum Intel is the interface between the -Transformers and Diffusers libraries and the different tools and -libraries provided by Intel to accelerate end-to-end pipelines on Intel -architectures. - -Optimum Intel provides a simple interface for optimizing your -Transformers and Diffusers models, converting them to the OpenVINO -Intermediate Representation (IR) format, and running inference using -OpenVINO Runtime, among other use cases. For running ModelScope models -using this interface we should download model from hub first. There are -several ways how to download models from Modelscope Hub, one of them is -usage of ``modelscope.snapshot_download`` function. This function -accepts model id from hub and optionally local directory (if not -provided, model will be downloaded to cache directory). - -After that, we can load model to Optimum Intel interface replacing the -``AutoModelForXxx`` class from transformers with the corresponding -``OVModelForXxx``. Model conversion will be performed on the fly. For -avoiding next time conversion, we can save model on disk using -``save_pretrained`` method and in the next time pass directory with -already converted model as argument in ``from_pretrained`` method. We -also specified ``device`` parameter for compiling the model on the -specific device, if not provided, the default device will be used. The -device can be changed later in runtime using ``model.to(device)``, -please note that it may require some time for model compilation on a -newly selected device. In some cases, it can be useful to separate model -initialization and compilation, for example, if you want to reshape the -model using ``reshape`` method, you can postpone compilation, providing -the parameter ``compile=False`` into ``from_pretrained`` method, -compilation can be performed manually using ``compile`` method or will -be performed automatically during first inference run. - -As example, we will use -`nlp_bert_sentiment-analysis_english-base `__. -This model was trained for classification input text on 3 sentiment -categories: negative, positive and neutral. In transformers, -``AutoModelForSequenceClassification`` should be used for model -initialization, so for usage model with OpenVINO, it is enough just -replace ``AutoModelForSequenceClassification`` to -``OVModelForSequenceClassification``. - -.. code:: ipython3 - - from modelscope import snapshot_download - - text_model_id = "iic/nlp_bert_sentiment-analysis_english-base" - text_model_path = Path(text_model_id.split("/")[-1]) - ov_text_model_path = text_model_path / "ov" - - - if not text_model_path.exists(): - snapshot_download(text_model_id, local_dir=text_model_path) - -Select inference device for text classification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - text_cls_device = device_widget("CPU", "NPU") - - text_cls_device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -Perform text classification -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - from transformers import AutoTokenizer - from optimum.intel.openvino import OVModelForSequenceClassification - - - tokenizer = AutoTokenizer.from_pretrained(text_model_path) - - if not ov_text_model_path.exists(): - # model will be automatically exported to OpenVINO format during loading - ov_model = OVModelForSequenceClassification.from_pretrained(text_model_path, text_cls_device.value) - ov_model.save_pretrained(ov_text_model_path) - # save converted model using save_pretrained for avoid conversion in next time - tokenizer.save_pretrained(ov_text_model_path) - else: - # load converted model directly if availa ble - ov_model = OVModelForSequenceClassification.from_pretrained(ov_text_model_path, device=text_cls_device.value) - - # prepare input - input_text = "Good night." - input_data = tokenizer(input_text, return_tensors="pt") - - # run model inference - output = ov_model(**input_data) - # postprocess results - predicted_label_id = output.logits[0].argmax().item() - - predicted_label = ov_model.config.id2label[predicted_label_id] - - print(f"predicted label: {predicted_label}") - - -.. parsed-literal:: - - predicted label: Positive - - -Convert ModelScope models for usage with OpenVINO GenAI -------------------------------------------------------- - - - -OpenVINO™ GenAI is a library of the most popular Generative AI model -pipelines, optimized execution methods, and samples that run on top of -highly performant `OpenVINO -Runtime `__. - -This library is friendly to PC and laptop execution, and optimized for -resource consumption. It requires no external dependencies to run -generative models as it already includes all the core functionality -(e.g. tokenization via openvino-tokenizers). - -You can also load and run models from ModelScope with OpenVINO GenAI -`supported -pipelines `__. - -This inference approach is also based on model representation obtained -using Optimum Intel and also requires to download ModelScope model -first. As example we will be -`qwen2.5-1.5b-instruct `__ -model for text generation, that is part of powerful Qwen2 LLMs family. -If in previous chapter we are focused with usage python API for -downloading and converting models, in this one - we are also considering -CLI usage for the same actions. - -Downloading ModelScope models using CLI can be performed using following -command: - -.. code:: bash - - modelscope download --local_dir - -where ```` is model id from Hub and ```` is -output directory for model saving. - -``optimum-cli`` provides command line interface for exporting models -using Optimum. General OpenVINO export command format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where task is task to export the model for. Available tasks depend on -the model, but are among: [‘default’, ‘fill-mask’, ‘text-generation’, -‘text2text-generation’, ‘text-classification’, ‘token-classification’, -‘multiple-choice’, ‘object-detection’, ‘question-answering’, -‘image-classification’, ‘image-segmentation’, ‘masked-im’, -‘semantic-segmentation’, ‘automatic-speech-recognition’, -‘audio-classification’, ‘audio-frame-classification’, -‘automatic-speech-recognition’, ‘audio-xvector’, ‘image-to-text’, -‘stable-diffusion’, ‘zero-shot-object-detection’]. - -You can find a mapping between tasks and model classes in Optimum -TaskManager -`documentation `__. - -Additionally, you can specify weights compression using -``--weight-format`` argument with one of following options: ``fp32``, -``fp16``, ``int8`` and ``int4``. 
For int8 and int4 nncf will be used for -weight compression. For models that required remote code execution, -``--trust-remote-code`` flag should be provided. - -Full list of supported arguments available via ``--help`` - -.. code:: ipython3 - - from IPython.display import Markdown, display - - model_id = "Qwen/Qwen2.5-1.5B-Instruct" - - llm_path = Path("Qwen2.5-1.5B-Instruct") - ov_llm_path = llm_path / "ov" - download_command = f"modelscope download {model_id} --local_dir {llm_path}" - display(Markdown("**Download command:**")) - display(Markdown(f"`{download_command}`")) - - if not llm_path.exists(): - !{download_command} - - - -**Download command:** - - - -``modelscope download Qwen/Qwen2.5-1.5B-Instruct --local_dir Qwen2.5-1.5B-Instruct`` - - -.. code:: ipython3 - - export_command = f"optimum-cli export openvino -m {llm_path} --task text-generation-with-past --weight-format int4 {ov_llm_path}" - display(Markdown("**Export command:**")) - display(Markdown(f"`{export_command}`")) - - if not ov_llm_path.exists(): - !{export_command} - - - -**Export command:** - - - -``optimum-cli export openvino -m Qwen2.5-1.5B-Instruct --task text-generation-with-past --weight-format int4 Qwen2.5-1.5B-Instruct/ov`` - - -Select inference device for text generation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - llm_device = device_widget("CPU") - - llm_device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -Run OpenVINO GenAI pipeline -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -For running text generation using OpenVINO GenAI, we should use -``LLMPipeline`` class initialized with providing converted model -directory and inference device. You can find more detailed example how -to use OpenVINO GenAI ``LLMPipeline`` for chatbot scenario in this -`tutorial `__. - -.. code:: ipython3 - - import openvino_genai as ov_genai - - - def streamer(subword): - print(subword, end="", flush=True) - # Return flag corresponds whether generation should be stopped. - # False means continue generation. - return False - - - llm_pipe = ov_genai.LLMPipeline(ov_llm_path, llm_device.value) - - llm_pipe.generate("The Sun is yellow because", max_new_tokens=200, streamer=streamer) - - -.. parsed-literal:: - - it has a spectrum of colors, and you are also looking at it. What color would the sun be if you could see its light without being able to see any other objects? If we imagine that someone had never seen or heard about the sun before, what would they expect to see? - - 1. **Color of the Sun**: The sun appears yellow when viewed from Earth due to the way our atmosphere scatters sunlight. This phenomenon occurs as follows: - - - **Sunlight Scattering**: When sunlight passes through the Earth's atmosphere, different wavelengths (colors) of light travel at slightly different speeds due to their varying energies. - - **Air Mass Height**: At higher altitudes where air density decreases with altitude, shorter wavelength (blue) photons have more energy and thus escape faster into space compared to longer wavelength (red) photons which remain in the atmosphere longer. - - **Sky Color**: As a result, blue light is scattered more than red light by molecules in the upper layers of the atmosphere - - - -.. parsed-literal:: - - " it has a spectrum of colors, and you are also looking at it. What color would the sun be if you could see its light without being able to see any other objects? 
If we imagine that someone had never seen or heard about the sun before, what would they expect to see?\n\n1. **Color of the Sun**: The sun appears yellow when viewed from Earth due to the way our atmosphere scatters sunlight. This phenomenon occurs as follows:\n\n - **Sunlight Scattering**: When sunlight passes through the Earth's atmosphere, different wavelengths (colors) of light travel at slightly different speeds due to their varying energies.\n - **Air Mass Height**: At higher altitudes where air density decreases with altitude, shorter wavelength (blue) photons have more energy and thus escape faster into space compared to longer wavelength (red) photons which remain in the atmosphere longer.\n - **Sky Color**: As a result, blue light is scattered more than red light by molecules in the upper layers of the atmosphere" - - - -.. code:: ipython3 - - import gc - - del llm_pipe - gc.collect(); diff --git a/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.jpg b/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.jpg deleted file mode 100644 index 97ae56df8a8721..00000000000000 --- a/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1745fd9f64ac9914621f7eee3668e86daa8121bc83d1a2c7f27963c85026f104 -size 66633 diff --git a/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.png b/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.png deleted file mode 100644 index d1c0d309736c1a..00000000000000 --- a/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6235ab7dd2cb4318435320004320ffc6de773044c51cadcd581a7996faca313a -size 636558 diff --git a/docs/notebooks/multilora-image-generation-with-output.rst b/docs/notebooks/multilora-image-generation-with-output.rst deleted file mode 100644 index 479df7d5d84db9..00000000000000 --- a/docs/notebooks/multilora-image-generation-with-output.rst +++ /dev/null @@ -1,473 +0,0 @@ -Multi LoRA Image Generation -=========================== - -LoRA, or `Low-Rank Adaptation `__, is -a popular and lightweight training technique used for fine-tuning Large -Language and Stable Diffusion Models without needing full model -training. Full fine-tuning of larger models (consisting of billions of -parameters) is inherently expensive and time-consuming. LoRA works by -adding a smaller number of new weights to the model for training, rather -than retraining the entire parameter space of the model. This makes -training with LoRA much faster, memory-efficient, and produces smaller -model weights (a few hundred MBs), which are easier to store and share. - -At its core, LoRA leverages the concept of low-rank matrix -factorization. Instead of updating all the parameters in a neural -network, LoRA decomposes the parameter space into two low-rank matrices. -This decomposition allows the model to capture essential information -with fewer parameters, significantly reducing the amount of data and -computation required for fine-tuning. - -|image0| - -By incorporating LoRA into Stable Diffusion models, we can enhance their -ability to understand complex relationships and patterns in data. 
This -approach opens up numerous possibilities: \* **Art and Design**: Artists -can fine-tune models to generate images that align with their unique -styles, creating personalized artwork effortlessly. \* **Content -Creation**: Businesses can customize image generation models to produce -branded visuals, enhancing marketing and media production. \* -**Entertainment**: Game developers and filmmakers can use fine-tuned -models to create realistic and imaginative worlds, streamlining the -creative process. - -In this tutorial we explore possibilities to use LoRA with OpenVINO -Generative API. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Convert Diffusion Model using Optimum - Intel <#convert-diffusion-model-using-optimum-intel>`__ - - - `Applying LoRA to Original Diffusers pipeline before - conversion <#applying-lora-to-original-diffusers-pipeline-before-conversion>`__ - -- `Image Generation using OpenVINO - GenAI <#image-generation-using-openvino-genai>`__ - - - `Integration LoRA into - pipeline <#integration-lora-into-pipeline>`__ - - `Prepare LoRA Adapters <#prepare-lora-adapters>`__ - - `Create Inference Pipeline <#create-inference-pipeline>`__ - - `Selection specific adapter during - generation <#selection-specific-adapter-during-generation>`__ - - `Use multiple adapters - simultaneously <#use-multiple-adapters-simultaneously>`__ - - `Disable adapters <#disable-adapters>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. |image0| image:: https://github.com/user-attachments/assets/bf823c71-13b4-402c-a7b4-d6fc30a60d88 - -.. code:: ipython3 - - import platform - - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu torch torchvision transformers accelerate "diffusers>0.25.0" pillow "gradio>=4.19" "peft>=0.7.0" - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" - %pip install -q -U "openvino>=2024.5.0" "openvino-tokenizers>=2024.5.0" "openvino-genai>=2024.5.0" - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - -.. code:: ipython3 - - import requests - from pathlib import Path - - notebook_utils_path = Path("notebook_utils.py") - lora_config_path = Path("lora_config.py") - - if not notebook_utils_path.exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - notebook_utils_path.open("w").write(r.text) - - if not lora_config_path.exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/multilora-image-generation/lora_config.py", - ) - lora_config_path.open("w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("multilora-image-generation.ipynb") - -Convert Diffusion Model using Optimum Intel -------------------------------------------- - - - -`Optimum Intel `__ is -the interface between the -`Transformers `__ and -`Diffusers `__ libraries -and OpenVINO to accelerate end-to-end pipelines on Intel architectures. 
-It provides ease-to-use -`interface `__ -for exporting models to `OpenVINO Intermediate Representation -(IR) `__ -format. - -Applying LoRA to Original Diffusers pipeline before conversion -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -LoRA can be easily added to `Diffusers -pipeline `__ -before export. At the export stage, LoRA weights will be fused to -original model weights and converted model will preserve LoRA provided -behavior. This approach is suitable when you need model with adapter -capabilities by default and it does not required configuration at -inference time (e.g. changing weight coefficient for adapter). For -example, we can use this method for speedup generation process with -integration `LCM LoRA `__. -Previously, we already considered with approach in this -`tutorial `__. - -Using ``optimum-cli`` for exporting models requires to provide model id -on HuggingFace Hub or local directory with saved model. In case, if -model stored in multiple separated repositories or directories (e.g. you -want to replace VAE component or add LoRA), it should be merged and -saved on disk before export. For avoiding this, we will use -``export_from_model`` function that accepts initialized model. -Additionally, for using model with OpenVINO GenAI, we need to export -tokenizers to OpenVINO format using `OpenVINO -Tokenizers `__ -library. - -In this tutorial we will use `Stable Diffusion -XL `__ -model, but the same steps are also applicable to other models of Stable -Diffusion family. - -.. code:: ipython3 - - from pathlib import Path - from diffusers import DiffusionPipeline, AutoencoderKL, LCMScheduler - from optimum.exporters.openvino import export_from_model - from optimum.intel.openvino import OVConfig - from optimum.exporters.openvino.convert import export_tokenizer - import gc - - model_dir = Path("sdxl-lcm") - - if not model_dir.exists(): - model_id = "stabilityai/stable-diffusion-xl-base-1.0" - adapter_id = "latent-consistency/lcm-lora-sdxl" - vae_id = "madebyollin/sdxl-vae-fp16-fix" - vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix") - pipe = DiffusionPipeline.from_pretrained(model_id, vae=vae, variant="fp16", use_safetensors=True) - pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) - pipe.load_lora_weights(adapter_id) - pipe.fuse_lora() - export_from_model(pipe, model_dir, task="text-to-image", stateful=False, ov_config=OVConfig(dtype="fp16")) - for tokenizer in ["tokenizer", "tokenizer_2"]: - tokenizer_model = getattr(pipe, tokenizer, None) - if tokenizer_model is not None: - export_tokenizer(tokenizer_model, model_dir / tokenizer, task="text-to-image") - del vae - del pipe - gc.collect() - - -.. parsed-literal:: - - 2024-11-08 16:49:48.963221: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
- 2024-11-08 16:49:48.977712: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - WARNING: All log messages before absl::InitializeLog() is called are written to STDERR - E0000 00:00:1731070188.992824 718925 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - E0000 00:00:1731070188.997386 718925 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2024-11-08 16:49:49.014687: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - - -Image Generation using OpenVINO GenAI -------------------------------------- - - - -`OpenVINO™ GenAI `__ -is a library of the most popular Generative AI model pipelines, -optimized execution methods, and samples that run on top of highly -performant `OpenVINO -Runtime `__. - -This library is friendly to PC and laptop execution, and optimized for -resource consumption. It requires no external dependencies to run -generative models as it already includes all the core functionality. - -``openvino_genai.Text2ImagePipeline`` class supports inference of -`Diffusers -models `__. -For pipeline initialization, we should provide directory with converted -by Optimum Intel pipeline and specify inference device. Optionally, we -can provide configuration for LoRA Adapters using ``adapter_config``. -For starting generation process ``generate`` method should be used. -Basically, it required to provide input text prompt for image -generation. You can provide additional arguments like negative prompt, -number of steps, guidance scale, image width and height to control -generation process. - -Integration LoRA into pipeline -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Similarly to Diffusers pipeline, you can store separately and load LoRA -into base pipeline before inference using OpenVINO GenAI. -``openvino_genai.AdapterConfig`` serves for adapters management in -``openvino_genai.Text2ImagePipeline``. It can be used for adding and -removing adapters or changing their weight coefficient for blending into -pipeline. You can add one or multiple adapters into config and also -specify alpha blending coefficients for their addition. OpenVINO GenAI -supports LoRA adapters saved in Safetensors format. You can use one of -publicly available pretrained adapters from -`CivitAI `__ or `HuggingFace -Hub `__ or train your own. > **Important -Note**: Before loading pretrained adapters, please make sure that they -are compatible with your base model architecture. E.g. if you use SDXL -model, you need to provide adapters trained for this model type and -loading adapter, for example, trained for FLUX is not allowed. - -Generally, process of adapters configuration consists of 2 steps: 1. -Register adapters in pipeline constructor. At this moment, it is -recommended to provide all adapters that you plan to use on this stage. -2. Choose which adapter (or a combination of adapters) to apply in each -``generate`` call. 
It is not obligated to use all of provided in -constructor adapters simultaneously, you can select one or combination -of several among them for each generation cycle. - -Prepare LoRA Adapters -~~~~~~~~~~~~~~~~~~~~~ - - - -.. _prepare-lora-adapters-1: - -Prepare LoRA Adapters -~~~~~~~~~~~~~~~~~~~~~ - -.. code:: ipython3 - - from lora_config import LORA - - # uncomment this line to see predefined LoRA adapters configuration used in this notebook - # LORA - -.. code:: ipython3 - - from huggingface_hub import hf_hub_download - - lora_dir = Path("lora") - adapter_paths = [] - - for lora in LORA: - lora_model_dir = lora_dir / lora["name"].lower().replace(" ", "_") - file_name = lora["file_name"] - if not (lora_model_dir / file_name).exists(): - hf_hub_download(repo_id=lora["model_id"], filename=file_name, local_dir=lora_model_dir) - adapter_paths.append(lora_model_dir / file_name) - -.. code:: ipython3 - - import openvino_genai as ov_genai - - - def prepare_adapter_config(scales=None): - if scales is None: - scales = [1 / len(adapter_paths)] * len(adapter_paths) - if isinstance(scales, float): - scales = [scales] * len(adapter_paths) - adapter_config = ov_genai.AdapterConfig() - for adapter, scale in zip(adapter_paths, scales): - adapter_config.add(ov_genai.Adapter(adapter), scale) - - return adapter_config - - - adapters_config = prepare_adapter_config(0.0) - adapters = adapters_config.get_adapters() - -Create Inference Pipeline -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -diffusion process involves random for preparing initial state for -denoising. For reproducibility of generation results, we will use -``Generator`` class. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(default="CPU", exclude=["NPU"]) - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -.. code:: ipython3 - - import openvino as ov - import torch - - - class Generator(ov_genai.Generator): - def __init__(self, seed): - ov_genai.Generator.__init__(self) - self.generator = torch.Generator(device="cpu").manual_seed(seed) - - def next(self): - return torch.randn(1, generator=self.generator, dtype=torch.float32).item() - - def randn_tensor(self, shape: ov.Shape): - torch_tensor = torch.randn(list(shape), generator=self.generator, dtype=torch.float32) - return ov.Tensor(torch_tensor.numpy()) - - - pipe = ov_genai.Text2ImagePipeline(model_dir, "CPU", adapters=adapters_config) - -Selection specific adapter during generation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -As it was already mention before, it is not necessary to use all -adapters specified at initialization stage for generation in the same -time. Providing adapters argument with ``openvino_genai.AdapterConfig`` -into ``generate`` allow to select one or several from them. For example, -let’s select LoRA for generation images in X-Ray style. - -.. code:: ipython3 - - subject = "a cute cat in sunglasses" - prompt_template = LORA[0].get("prompt", "") - adapter_weight = LORA[0].get("weight", 1.0) - prompt = prompt_template.replace("", subject) - adapter_config = ov_genai.AdapterConfig() - adapter_config.add(adapters[0], adapter_weight) - image_tensor = pipe.generate(prompt, num_inference_steps=4, guidance_scale=0, adapters=adapter_config, generator=Generator(421235)) - -.. code:: ipython3 - - from PIL import Image - - image = Image.fromarray(image_tensor.data[0]) - image - - - - -.. 
image:: multilora-image-generation-with-output_files/multilora-image-generation-with-output_15_0.png - - - -Use multiple adapters simultaneously -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -You also can use combination of adapters that will be applied in the -same time. Let’s see what happens if traditional Japanese art will meet -modern illustration pointillistic style. - -.. code:: ipython3 - - prompt_template1 = LORA[1].get("prompt", "") - prompt_template2 = LORA[2].get("prompt", "") - adapter1_weight = LORA[1].get("weight", 1.0) - adapter2_weight = LORA[2].get("weight", 1.0) - - prompt = prompt_template2.replace("", prompt_template1.replace("", subject)) - adapter_config = ov_genai.AdapterConfig() - adapter_config.add(adapters[1], adapter1_weight) - adapter_config.add(adapters[2], adapter2_weight) - image_tensor = pipe.generate(prompt, num_inference_steps=4, guidance_scale=0, adapters=adapter_config, generator=Generator(421235)) - -.. code:: ipython3 - - image = Image.fromarray(image_tensor.data[0]) - image - - - - -.. image:: multilora-image-generation-with-output_files/multilora-image-generation-with-output_18_0.png - - - -Disable adapters -~~~~~~~~~~~~~~~~ - - - -You can disable adapters providing empty ``AdapterConfig`` into generate - -.. code:: ipython3 - - image_tensor = pipe.generate(subject, num_inference_steps=4, guidance_scale=0, adapters=ov_genai.AdapterConfig(), generator=Generator(421235)) - -.. code:: ipython3 - - image = Image.fromarray(image_tensor.data[0]) - image - - - - -.. image:: multilora-image-generation-with-output_files/multilora-image-generation-with-output_21_0.png - - - -Interactive demo ----------------- - - - -.. code:: ipython3 - - gradio_helper_path = Path("gradio_helper.py") - - if not gradio_helper_path.exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/multilora-image-generation/gradio_helper.py", - ) - lora_config_path.open("w").write(r.text) - -.. 
code:: ipython3 - - from gradio_helper import make_demo - - demo = make_demo(pipe, Generator, adapters, LORA) - - try: - demo.launch(debug=False) - except Exception: - demo.launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_15_0.jpg b/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_15_0.jpg deleted file mode 100644 index 1427e6afb594ac..00000000000000 --- a/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_15_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:378fce8c53832fa402e94c50995aa5f188d16a6a6886c08fe4f8323bcf7daabe -size 42135 diff --git a/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_15_0.png b/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_15_0.png deleted file mode 100644 index 873721f87cc2a3..00000000000000 --- a/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_15_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:27ac6d45499eb6e67ddf78f8f3493fd3e9dc3885cec2b4fda8067f9b1f7a9ebf -size 1252162 diff --git a/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_18_0.jpg b/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_18_0.jpg deleted file mode 100644 index 1b6a88d2cde069..00000000000000 --- a/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_18_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:de879c60657ad9c471ccc971d63cc2ac25be5b477c6ebcd8b2e1a2a438b2f3c1 -size 146062 diff --git a/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_18_0.png b/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_18_0.png deleted file mode 100644 index 9b26d20ef04ab8..00000000000000 --- a/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_18_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ae8cec0bac904c1868d7786121978b2ca819ead5c8b02cf09bb07f75b927a3a1 -size 1940316 diff --git a/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_21_0.jpg b/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_21_0.jpg deleted file mode 100644 index 199be9b483e18f..00000000000000 --- a/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_21_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:564848925f540cf500457a4996631ba616cc6547b63d377ce22ac8c3e9431c04 -size 87425 diff --git a/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_21_0.png b/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_21_0.png deleted file mode 100644 index 
bbf4eaaf030a42..00000000000000 --- a/docs/notebooks/multilora-image-generation-with-output_files/multilora-image-generation-with-output_21_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b573ab59972699e762f8a52c0ce17a0db060230effe78e2ae3408290a9173103 -size 1417021 diff --git a/docs/notebooks/multimodal-rag-llamaindex-with-output.rst b/docs/notebooks/multimodal-rag-llamaindex-with-output.rst deleted file mode 100644 index e6dd7330e74f49..00000000000000 --- a/docs/notebooks/multimodal-rag-llamaindex-with-output.rst +++ /dev/null @@ -1,840 +0,0 @@ -Multimodal RAG for video analytics with LlamaIndex -================================================== - -Constructing a RAG pipeline for text is relatively straightforward, -thanks to the tools developed for parsing, indexing, and retrieving text -data. However, adapting RAG models for video content presents a greater -challenge. Videos combine visual, auditory, and textual elements, -requiring more processing power and sophisticated video pipelines. - -To build a truly multimodal search for videos, you need to work with -different modalities of a video like spoken content, visual. In this -notebook, we showcase a Multimodal RAG pipeline designed for video -analytics. It utilizes Whisper model to convert spoken content to text, -CLIP model to generate multimodal embeddings, and Vision Language model -(VLM) to process retrieved images and text messages. The following -picture illustrates how this pipeline is working. - -.. figure:: https://github.com/user-attachments/assets/baef4914-5c07-432c-9363-1a0cb5944b09 - :alt: Multimodal RAG - - Multimodal RAG - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Convert and Compress models <#convert-and-compress-models>`__ - - - `ASR model <#asr-model>`__ - - `CLIP model <#clip-model>`__ - - `VLM model <#vlm-model>`__ - -- `Download and process video <#download-and-process-video>`__ - - - `Initialize ASR <#initialize-asr>`__ - -- `Create the multi-modal index <#create-the-multi-modal-index>`__ - - - `Initialize CLIP <#initialize-clip>`__ - -- `Search text and image - embeddings <#search-text-and-image-embeddings>`__ -- `Generate final response using - VLM <#generate-final-response-using-vlm>`__ - - - `Set the RAG prompt template <#set-the-rag-prompt-template>`__ - - `Initialize VLM <#initialize-vlm>`__ - -- `Interactive Demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -install required packages and setup helper functions. - -.. code:: ipython3 - - %pip uninstall -q -y "moviepy" "decorator" - - %pip install -q "llama-index-core" "llama-index-embeddings-openvino>=0.4.1" "llama-index-multi-modal-llms-openvino" "llama-index-readers-file" \ - "llama-index-vector-stores-qdrant" \ - "transformers>=4.45" \ - "moviepy>=2.1.1" \ - "librosa" \ - "python-ffmpeg<=1.0.16" \ - "open_clip_torch" \ - "huggingface_hub" \ - "gradio>=4.44.1" --extra-index-url https://download.pytorch.org/whl/cpu - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -.. code:: ipython3 - - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" "openvino>=2024.5.0" - - -.. 
parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - - -.. code:: ipython3 - - import os - import requests - from pathlib import Path - - os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w", encoding="utf-8").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py") - open("cmd_helper.py", "w", encoding="utf-8").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("multimodal-rag-llamaindex.ipynb") - -Convert and Compress models ---------------------------- - - - -ASR model -~~~~~~~~~ - - - -In this example, we utilize -`Distil-Whisper `__ -to recognize the spoken content in video and generate text. -Distil-Whisper is a distilled variant of the -`Whisper `__ model by -OpenAI. The Distil-Whisper is proposed in the paper `Robust Knowledge -Distillation via Large-Scale Pseudo -Labelling `__. According to authors, -compared to Whisper, Distil-Whisper runs in several times faster with -50% fewer parameters, while performing to within 1% word error rate -(WER) on out-of-distribution evaluation data. For more information about -Distil-Whisper, please refer `Distil-Whisper -notebook `__. - -.. code:: ipython3 - - import huggingface_hub as hf_hub - - asr_model_id = "OpenVINO/distil-whisper-large-v3-int8-ov" - asr_model_path = asr_model_id.split("/")[-1] - - if not Path(asr_model_path).exists(): - hf_hub.snapshot_download(asr_model_id, local_dir=asr_model_path) - -CLIP model -~~~~~~~~~~ - - - -In this example, CLIP model will help to generate the embedding vectors -for both text and images. CLIP (Contrastive Language-Image Pre-Training) -is a neural network trained on various (image, text) pairs. It can be -instructed in natural language to predict the most relevant text -snippet, given an image, without directly optimizing for the task. - -CLIP uses a `ViT `__ like transformer -to get visual features and a causal language model to get the text -features. The text and visual features are then projected into a latent -space with identical dimensions. The dot product between the projected -image and text features is then used as a similarity score. - -.. code:: ipython3 - - from cmd_helper import optimum_cli - - clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" - clip_model_path = clip_model_id.split("/")[-1] - - if not Path(clip_model_path).exists(): - optimum_cli(clip_model_id, clip_model_path) - - - -**Export command:** - - - -``optimum-cli export openvino --model laion/CLIP-ViT-B-32-laion2B-s34B-b79K CLIP-ViT-B-32-laion2B-s34B-b79K`` - - -VLM model -~~~~~~~~~ - - - -Vision Language model (VLM) is used to generate final response regrading -the context of images and texts retrieved from vector DB. It can help to -understand the both language and image instructions to complete various -real-world tasks. In this example, we select -`Phi-3.5-Vision `__ -as VLM. 
- -The Phi-3-Vision is a lightweight, state-of-the-art open multimodal -model built upon datasets which include - synthetic data and filtered -publicly available websites - with a focus on very high-quality, -reasoning dense data both on text and vision. The model belongs to the -Phi-3 model family, and the multimodal version comes with 128K context -length (in tokens) it can support. The model underwent a rigorous -enhancement process, incorporating both supervised fine-tuning and -direct preference optimization to ensure precise instruction adherence -and robust safety measures. More details about model can be found in -`model blog -post `__, -`technical report `__, -`Phi-3-cookbook `__ - -.. code:: ipython3 - - vlm_model_id = "microsoft/Phi-3.5-vision-instruct" - vlm_model_path = Path(vlm_model_id.split("/")[-1]) / "FP16" - - if not vlm_model_path.exists(): - !optimum-cli export openvino --model {vlm_model_id} --weight-format fp16 {vlm_model_path} --trust-remote-code - -.. code:: ipython3 - - import shutil - import nncf - import openvino as ov - import gc - - core = ov.Core() - - compression_config = { - "mode": nncf.CompressWeightsMode.INT4_SYM, - "group_size": 64, - "ratio": 0.6, - } - - compressed_model_path = vlm_model_path.parent / "INT4" - if not compressed_model_path.exists(): - ov_model = core.read_model(vlm_model_path / "openvino_language_model.xml") - compressed_ov_model = nncf.compress_weights(ov_model, **compression_config) - ov.save_model( - compressed_ov_model, - compressed_model_path / "openvino_language_model.xml", - ) - del compressed_ov_model - del ov_model - gc.collect() - for file_name in vlm_model_path.glob("*"): - if file_name.name in [ - "openvino_language_model.xml", - "openvino_language_model.bin", - ]: - continue - shutil.copy(file_name, compressed_model_path) - -Download and process video --------------------------- - - - -To begin, download an example video from YouTube and extract the audio -and frame files from it. - -.. code:: ipython3 - - video_url = "https://github.com/user-attachments/assets/2e38afa5-ce3e-448c-ad1a-0e9471039ff1" - output_folder = "./mixed_data/" - output_audio_path = "./mixed_data/output_audio.wav" - filepath = "video_data/input_vid.mp4" - - example_path = Path(filepath) - example_path.parent.mkdir(parents=True, exist_ok=True) - Path(output_folder).mkdir(parents=True, exist_ok=True) - - if not example_path.exists(): - r = requests.get(video_url) - with example_path.open("wb") as f: - f.write(r.content) - -Initialize ASR -~~~~~~~~~~~~~~ - - - -Select inference device - -.. code:: ipython3 - - from notebook_utils import device_widget - - asr_device = device_widget(default="AUTO", exclude=["NPU"]) - - asr_device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -The Hugging Face Optimum API is a high-level API that enables us to -convert and quantize models from the Hugging Face Transformers library -to the OpenVINO™ IR format. For more details, refer to the `Hugging Face -Optimum -documentation `__. - -Optimum Intel can be used to load optimized models from the `Hugging -Face Hub `__ and -create pipelines to run an inference with OpenVINO Runtime using Hugging -Face APIs. The Optimum Inference models are API compatible with Hugging -Face Transformers models. This means we just need to replace the -``AutoModelForXxx`` class with the corresponding ``OVModelForXxx`` -class. - -.. 
code:: ipython3 - - from optimum.intel import OVModelForSpeechSeq2Seq - from transformers import AutoProcessor, pipeline - - asr_model = OVModelForSpeechSeq2Seq.from_pretrained(asr_model_path, device=asr_device.value) - asr_processor = AutoProcessor.from_pretrained(asr_model_path) - - pipe = pipeline("automatic-speech-recognition", model=asr_model, tokenizer=asr_processor.tokenizer, feature_extractor=asr_processor.feature_extractor) - -.. code:: ipython3 - - import librosa - from moviepy.video.io.VideoFileClip import VideoFileClip - - - def video_to_images(video_path, output_folder): - """ - Convert a video to a sequence of images and save them to the output folder. - - Params: - video_path (str): The path to the video file. - output_folder (str): The path to the folder to save the images to. - - """ - clip = VideoFileClip(video_path) - clip.write_images_sequence(os.path.join(output_folder, "frame%04d.png"), fps=0.1) - - - def video_to_audio(video_path, output_audio_path): - """ - Convert a video to audio and save it to the output path. - - Params: - video_path (str): The path to the video file. - output_audio_path (str): The path to save the audio to. - - """ - clip = VideoFileClip(video_path) - audio = clip.audio - audio.write_audiofile(output_audio_path) - - - def audio_to_text(audio_path): - """ - Convert audio to text using the SpeechRecognition library. - - Params: - audio_path (str): The path to the audio file. - - Returns: - test (str): The text recognized from the audio. - - """ - en_raw_speech, samplerate = librosa.load(audio_path, sr=16000) - result = pipe(en_raw_speech, return_timestamps=True) - - return result["text"] - -In this step, we will extract the images and audio from video, then -convert its audio into text. - -.. code:: ipython3 - - try: - video_to_images(filepath, output_folder) - video_to_audio(filepath, output_audio_path) - text_data = audio_to_text(output_audio_path) - - with open(output_folder + "output_text.txt", "w") as file: - file.write(text_data) - print("Text data saved to file") - file.close() - os.remove(output_audio_path) - - except Exception as e: - raise e - - -.. parsed-literal:: - - Moviepy - Writing frames ./mixed_data/frame%04d.png. - - - - - - -.. parsed-literal:: - - Moviepy - Done writing frames ./mixed_data/frame%04d.png. - MoviePy - Writing audio in ./mixed_data/output_audio.wav - - - - - - -.. parsed-literal:: - - MoviePy - Done. - - -.. parsed-literal:: - - You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe. - - -.. parsed-literal:: - - Text data saved to file - - -Create the multi-modal index ----------------------------- - - - -In this step, we are going to build multi-modal index and vector store -to index both text and images. The CLIP model is used to generate the -embedding vector for texts and images. - -Initialize CLIP -~~~~~~~~~~~~~~~ - - - -Select inference device - -.. code:: ipython3 - - clip_device = device_widget(default="AUTO", exclude=["NPU"]) - - clip_device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -Class ``OpenVINOClipEmbedding`` in LlamaIndex can support exporting and -loading open_clip models with OpenVINO runtime. for more information, -please refer `Local Embeddings with -OpenVINO `__. - -.. 
code:: ipython3 - - from llama_index.embeddings.huggingface_openvino import OpenVINOClipEmbedding - - clip_model = OpenVINOClipEmbedding(model_id_or_path=clip_model_path, device=clip_device.value) - -.. code:: ipython3 - - import torch - - if hasattr(torch, "mps") and torch.mps.is_available: - torch.mps.is_available = lambda: False - - from llama_index.core.indices import MultiModalVectorStoreIndex - from llama_index.vector_stores.qdrant import QdrantVectorStore - from llama_index.core import StorageContext, Settings - from llama_index.core.node_parser import SentenceSplitter - from llama_index.core import SimpleDirectoryReader - import qdrant_client - - # Create the MultiModal index - documents = SimpleDirectoryReader(output_folder).load_data() - - # Create a local Qdrant vector store - client = qdrant_client.QdrantClient(":memory:") - - text_store = QdrantVectorStore(client=client, collection_name="text_collection") - image_store = QdrantVectorStore(client=client, collection_name="image_collection") - storage_context = StorageContext.from_defaults(vector_store=text_store, image_store=image_store) - -.. code:: ipython3 - - Settings.embed_model = clip_model - - index = MultiModalVectorStoreIndex.from_documents( - documents, storage_context=storage_context, image_embed_model=Settings.embed_model, transformations=[SentenceSplitter(chunk_size=300, chunk_overlap=30)] - ) - - retriever_engine = index.as_retriever(similarity_top_k=2, image_similarity_top_k=5) - - -.. parsed-literal:: - - WARNING:root:Payload indexes have no effect in the local Qdrant. Please use server Qdrant if you need payload indexes. - WARNING:root:Payload indexes have no effect in the local Qdrant. Please use server Qdrant if you need payload indexes. - - -Search text and image embeddings --------------------------------- - - - -To simply the prompt for VLM, we have to prepare the context of text and -images regarding user’s query. In this step, the most relevant context -will be retrieved from vector DB through multi-modal index. - -.. code:: ipython3 - - from llama_index.core.response.notebook_utils import display_source_node - from llama_index.core.schema import ImageNode - from PIL import Image - import matplotlib.pyplot as plt - import os - - - def plot_images(image_paths): - images_shown = 0 - plt.figure(figsize=(16, 9)) - for img_path in image_paths: - if os.path.isfile(img_path): - image = Image.open(img_path) - - plt.subplot(2, 3, images_shown + 1) - plt.imshow(image) - plt.xticks([]) - plt.yticks([]) - - images_shown += 1 - if images_shown >= 7: - break - - - def retrieve(retriever_engine, query_str): - retrieval_results = retriever_engine.retrieve(query_str) - - retrieved_image = [] - retrieved_text = [] - for res_node in retrieval_results: - if isinstance(res_node.node, ImageNode): - retrieved_image.append(res_node.node.metadata["file_path"]) - else: - display_source_node(res_node, source_length=200) - retrieved_text.append(res_node.text) - - return retrieved_image, retrieved_text - -.. code:: ipython3 - - query_str = "tell me more about gaussian function" - - img, txt = retrieve(retriever_engine=retriever_engine, query_str=query_str) - image_documents = SimpleDirectoryReader(input_dir=output_folder, input_files=img).load_data() - context_str = "".join(txt) - plot_images(img) - print(txt) - - - -**Node ID:** 72d9a990-ba98-4281-be50-42ef56254cad\ **Similarity:** -0.6902350328083398\ **Text:** The basic function underlying a normal -distribution, aka a Gaussian, is E to the negative x squared. 
But you -might wonder why this function? Of all the expressions we could dream up -that give you s… - - - -**Node ID:** 5989233f-c4dd-4394-802f-5b7d0fa1cacc\ **Similarity:** -0.679477746185225\ **Text:** I’d like to share an especially pleasing -visual way that you can think about this calculation, which hopefully -offers some sense of what makes the E to the negative x squared function -special in th… - - -.. parsed-literal:: - - ["The basic function underlying a normal distribution, aka a Gaussian, is E to the negative x squared. But you might wonder why this function? Of all the expressions we could dream up that give you some symmetric smooth graph with mass concentrated towards the middle, why is it that the theory of probability seems to have a special place in its heart for this particular expression? For the last many videos I've been hinting at an answer to this question, and here we'll finally arrive at something like a satisfying answer. As a quick refresher on where we are, a couple videos ago we talked about the central limit theorem, which describes how as you add multiple copies of a random variable, for example rolling a weighted die many different times or letting a ball bounce off of a peg repeatedly, then the distribution describing that sum tends to look approximately like a normal distribution. What the central limit theorem says is as you make that sum bigger and bigger, under appropriate conditions, that approximation to a normal becomes better and better. But I never explained why this theorem is actually true. We only talked about what it's claiming. In the last video, we started talking about the math involved in adding two random variables. If you have two random variables each following some distribution, then to find the distribution describing the sum of those variables, you compute something known as a convolution between the two original functions. And we spent a lot of time building up two distinct ways to visualize what this convolution operation really is. really is. Today, our basic job is to work through a particular example, which is to ask, what happens when you add two normally distributed random variables, which, as you know by now, is the same as asking what do you get if you compute a convolution between two Gaussian functions?", "I'd like to share an especially pleasing visual way that you can think about this calculation, which hopefully offers some sense of what makes the E to the negative x squared function special in the first place. After we walk through it, we'll talk about how this calculation is one of the steps involved improving the central limit theorem. It's the step that answers the question of why a Gaussian and not something else is the central limit. But first, let's dive in. The full formula for a Gaussian is more complicated than just e to the negative x squared. The exponent is typically written as negative 1 half times x divided by sigma squared, where sigma describes the spread of the distribution, specifically the standard deviation. All of this needs to be multiplied by a fraction on the front, which is there to make sure that the area under the curve is one, making it a valid probability distribution, and if you want to consider distributions that aren't necessarily centered at zero, you would also throw another parameter mu into the exponent like this, although for everything we'll be doing here we just consider centered distributions. 
Now if you look at our central goal for today, which is to compute a convolution between two Gaussian functions, The direct way to do this would be to take the definition of a convolution, this integral expression we built up last video, and then to plug in for each one of the functions involved, the formula for a Gaussian. It's kind of a lot of symbols when you throw it all together, but more than anything, working this out, is an exercise in completing the square. And there's nothing wrong with that. That will get you the answer that you want."] - - - -.. image:: multimodal-rag-llamaindex-with-output_files/multimodal-rag-llamaindex-with-output_29_3.png - - -Generate final response using VLM ---------------------------------- - - - -Set the RAG prompt template -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - qa_tmpl_str = ( - "Given the provided information, including relevant images and retrieved context from the video, \ - accurately and precisely answer the query without any additional prior knowledge.\n" - "Please ensure honesty and responsibility, refraining from any racist or sexist remarks.\n" - "---------------------\n" - "Context: {context_str}\n" - "---------------------\n" - "Query: {query_str}\n" - "Answer: " - ) - -Initialize VLM -~~~~~~~~~~~~~~ - - - -Select inference device - -.. code:: ipython3 - - vlm_device = device_widget(default="AUTO", exclude=["NPU"]) - - vlm_device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -``OpenVINOMultiModal`` class provides convenient way for running -multimodal model in LlamaIndex. It accepts directory with converted -model and inference device as arguments. For running model with -streaming we will use ``stream_complete`` method. For more information -about the OpenVINO multimodal models support in LlamaIndex, refer to the -`OpenVINOMultiModal -Document `__. - -.. code:: ipython3 - - from transformers import AutoProcessor, AutoTokenizer - - vlm_int4_model_path = "Phi-3.5-vision-instruct/INT4" - - processor = AutoProcessor.from_pretrained(vlm_int4_model_path, trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(vlm_int4_model_path) - - - def messages_to_prompt(messages, image_documents): - """ - Prepares the input messages and images. - """ - images = [] - placeholder = "" - - for i, img_doc in enumerate(image_documents, start=1): - images.append(Image.open(img_doc.image_path)) - placeholder += f"<|image_{i}|>\n" - conversation = [ - {"role": "user", "content": placeholder + messages[0].content}, - ] - - prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True) - - inputs = processor(prompt, images, return_tensors="pt") - return inputs - -.. code:: ipython3 - - from llama_index.multi_modal_llms.openvino import OpenVINOMultiModal - - vlm = OpenVINOMultiModal( - model_id_or_path=vlm_int4_model_path, - device=vlm_device.value, - messages_to_prompt=messages_to_prompt, - trust_remote_code=True, - generate_kwargs={"do_sample": False, "eos_token_id": processor.tokenizer.eos_token_id}, - ) - -.. code:: ipython3 - - response = vlm.stream_complete( - prompt=qa_tmpl_str.format(context_str=context_str, query_str=query_str), - image_documents=image_documents, - ) - for r in response: - print(r.delta, end="") - - -.. parsed-literal:: - - A Gaussian function, also known as a normal distribution, is a type of probability distribution that is symmetric and bell-shaped. 
It is characterized by its mean and standard deviation, which determine the center and spread of the distribution, respectively. The Gaussian function is widely used in statistics and probability theory due to its unique properties and applications in various fields such as physics, engineering, and finance. The function is defined by the equation e to the negative x squared, where x represents the input variable. The graph of a Gaussian function is a smooth curve that approaches the x-axis as it moves away from the center, creating a bell-like shape. The function is also known for its property of being able to describe the distribution of random variables, making it a fundamental concept in probability theory and statistics. - -Interactive Demo ----------------- - - - -Now, you can try to chat with model. Upload video, provide your text -message into ``Input`` field and click ``Submit`` to start -communication. - -.. code:: ipython3 - - import gradio as gr - import base64 - - - def path2base64(path): - with open(path, "rb") as f: - byte_data = f.read() - base64_str = base64.b64encode(byte_data).decode("ascii") - return base64_str - - - def build_index(video_path): - """ - callback function for building index of vector store - - Params: - video_path: path of uploaded video file - Returns: - vector store is ready - - """ - - global retriever_engine - progress = gr.Progress() - progress(None, desc="Video to Images...") - video_to_images(video_path, output_folder) - progress(None, desc="Video to Audio...") - video_to_audio(video_path, output_audio_path) - progress(None, desc="Audio to Texts...") - text_data = audio_to_text(output_audio_path) - - with open(output_folder + "output_text.txt", "w") as file: - file.write(text_data) - print("Text data saved to file") - file.close() - os.remove(output_audio_path) - - progress(0, desc="Building Index...") - documents = SimpleDirectoryReader(output_folder).load_data() - client = qdrant_client.QdrantClient(":memory:") - - text_store = QdrantVectorStore(client=client, collection_name="text_collection") - image_store = QdrantVectorStore(client=client, collection_name="image_collection") - storage_context = StorageContext.from_defaults(vector_store=text_store, image_store=image_store) - index = MultiModalVectorStoreIndex.from_documents( - documents, storage_context=storage_context, image_embed_model=Settings.embed_model, transformations=[SentenceSplitter(chunk_size=300, chunk_overlap=30)] - ) - - retriever_engine = index.as_retriever(similarity_top_k=2, image_similarity_top_k=5) - return "Vector Store is Ready" - - - def search(history): - """ - callback function for searching vector store - - Params: - history: conversation history - Returns: - lists of retrieved images and texts - - """ - progress = gr.Progress() - progress(None, desc="Searching...") - img, txt = retrieve(retriever_engine=retriever_engine, query_str=history[-1][0]) - return img, txt - - - def generate(history, images, texts): - """ - callback function for running chatbot on submit button click - - Params: - history: conversation history - images: list of retrieved images - texts: list of retrieved texts - - """ - progress = gr.Progress() - progress(None, desc="Generating...") - image_documents = SimpleDirectoryReader(input_dir=output_folder, input_files=images).load_data() - - context_str = "".join(texts) - - response = vlm.stream_complete( - prompt=qa_tmpl_str.format(context_str=context_str, query_str=history[-1][0]), - image_documents=image_documents, - ) - images_list = "" - for 
image in images: - image_base64 = path2base64(image) - images_list += f'' - images_list += "\n" - partial_text = "According to audio and following screenshots from the video: \n" - partial_text += images_list - for r in response: - partial_text += r.delta - history[-1][1] = partial_text - yield history - - - def stop(): - vlm._model.request.cancel() - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/multimodal-rag/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(filepath, build_index, search, generate, stop) - - try: - demo.queue().launch() - except Exception: - demo.queue().launch(share=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/multimodal-rag-llamaindex-with-output_files/multimodal-rag-llamaindex-with-output_29_3.png b/docs/notebooks/multimodal-rag-llamaindex-with-output_files/multimodal-rag-llamaindex-with-output_29_3.png deleted file mode 100644 index 82922c0905b2d6..00000000000000 --- a/docs/notebooks/multimodal-rag-llamaindex-with-output_files/multimodal-rag-llamaindex-with-output_29_3.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d7e30eacdf90c997a14ca54472d5d014bea41f3d2a0e0ba2ecc27470579d57ea -size 156997 diff --git a/docs/notebooks/music-generation-with-output.rst b/docs/notebooks/music-generation-with-output.rst deleted file mode 100644 index fd214423f0eb15..00000000000000 --- a/docs/notebooks/music-generation-with-output.rst +++ /dev/null @@ -1,676 +0,0 @@ -Controllable Music Generation with MusicGen and OpenVINO -======================================================== - -MusicGen is a single-stage auto-regressive Transformer model capable of -generating high-quality music samples conditioned on text descriptions -or audio prompts. The text prompt is passed to a text encoder model (T5) -to obtain a sequence of hidden-state representations. These hidden -states are fed to MusicGen, which predicts discrete audio tokens (audio -codes). Finally, audio tokens are then decoded using an audio -compression model (EnCodec) to recover the audio waveform. - -.. figure:: https://user-images.githubusercontent.com/76463150/260439306-81c81c8d-1f9c-41d0-b881-9491766def8e.png - :alt: pipeline - - pipeline - -`The MusicGen model `__ does not -require a self-supervised semantic representation of the text/audio -prompts; it operates over several streams of compressed discrete music -representation with efficient token interleaving patterns, thus -eliminating the need to cascade multiple models to predict a set of -codebooks (e.g. hierarchically or upsampling). Unlike prior models -addressing music generation, it is able to generate all the codebooks in -a single forward pass. - -In this tutorial, we consider how to run the MusicGen model using -OpenVINO. - -We will use a model implementation from the `Hugging Face -Transformers `__ -library. 
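-
-For orientation, the plain Hugging Face invocation that wraps all three
-stages (T5 text encoder, MusicGen decoder, EnCodec audio decoder) looks
-roughly like the sketch below. The class names are the real Transformers
-APIs used later in this tutorial, while the prompt and token budget are
-only example values; the rest of the tutorial converts each stage to
-OpenVINO IR and plugs the converted models back into this pipeline.
-
-.. code:: ipython3
-
-    from transformers import AutoProcessor, MusicgenForConditionalGeneration
-
-    processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
-    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
-
-    inputs = processor(text=["80s pop track with bassy drums and synth"], return_tensors="pt")
-    # T5 encodes the prompt, the MusicGen decoder predicts audio codes,
-    # and EnCodec turns those codes into a waveform, all inside generate()
-    audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=256)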
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ - - - `Install requirements <#install-requirements>`__ - - `Imports <#imports>`__ - -- `MusicGen in HF Transformers <#musicgen-in-hf-transformers>`__ - - - `Original Pipeline Inference <#original-pipeline-inference>`__ - -- `Convert models to OpenVINO Intermediate representation (IR) - format <#convert-models-to-openvino-intermediate-representation-ir-format>`__ - - - `0. Set Up Variables <#0--set-up-variables>`__ - - `1. Convert Text Encoder <#1--convert-text-encoder>`__ - - `2. Convert MusicGen Language - Model <#2--convert-musicgen-language-model>`__ - - `3. Convert Audio Decoder <#3--convert-audio-decoder>`__ - -- `Embedding the converted models into the original - pipeline <#embedding-the-converted-models-into-the-original-pipeline>`__ - - - `Select inference device <#select-inference-device>`__ - - `Adapt OpenVINO models to the original - pipeline <#adapt-openvino-models-to-the-original-pipeline>`__ - -- `Try out the converted pipeline <#try-out-the-converted-pipeline>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install requirements -~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %pip install -q "openvino>=2023.3.0" - %pip install -q "torch>=2.1" "gradio>=4.19" "transformers" packaging --extra-index-url https://download.pytorch.org/whl/cpu - -Imports -~~~~~~~ - - - -.. code:: ipython3 - - from collections import namedtuple - from functools import partial - import gc - from pathlib import Path - from typing import Optional, Tuple - import warnings - - from IPython.display import Audio - import openvino as ov - import numpy as np - import torch - from torch.jit import TracerWarning - from transformers import AutoProcessor, MusicgenForConditionalGeneration - from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - CausalLMOutputWithCrossAttentions, - ) - - # Ignore tracing warnings - warnings.filterwarnings("ignore", category=TracerWarning) - -MusicGen in HF Transformers ---------------------------- - - - -To work with -`MusicGen `__ by Meta -AI, we will use `Hugging Face Transformers -package `__. Transformers -package exposes the ``MusicgenForConditionalGeneration`` class, -simplifying the model instantiation and weights loading. The code below -demonstrates how to create a ``MusicgenForConditionalGeneration`` and -generate a text-conditioned music sample. - -.. code:: ipython3 - - import sys - from packaging.version import parse - - - if sys.version_info < (3, 8): - import importlib_metadata - else: - import importlib.metadata as importlib_metadata - loading_kwargs = {} - - if parse(importlib_metadata.version("transformers")) >= parse("4.40.0"): - loading_kwargs["attn_implementation"] = "eager" - - - # Load the pipeline - model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small", torchscript=True, return_dict=False, **loading_kwargs) - -In the cell below user is free to change the desired music sample -length. - -.. 
code:: ipython3 - - sample_length = 8 # seconds - - n_tokens = sample_length * model.config.audio_encoder.frame_rate + 3 - sampling_rate = model.config.audio_encoder.sampling_rate - print("Sampling rate is", sampling_rate, "Hz") - - model.to("cpu") - model.eval(); - - -.. parsed-literal:: - - Sampling rate is 32000 Hz - - -Original Pipeline Inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Text Preprocessing prepares the text prompt to be fed into the model, -the ``processor`` object abstracts this step for us. Text tokenization -is performed under the hood, it assigning tokens or IDs to the words; in -other words, token IDs are just indices of the words in the model -vocabulary. It helps the model understand the context of a sentence. - -.. code:: ipython3 - - processor = AutoProcessor.from_pretrained("facebook/musicgen-small") - - inputs = processor( - text=["80s pop track with bassy drums and synth"], - return_tensors="pt", - ) - - audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=n_tokens) - - Audio(audio_values[0].cpu().numpy(), rate=sampling_rate) - - - - -.. raw:: html - - - - - - - -Convert models to OpenVINO Intermediate representation (IR) format ------------------------------------------------------------------- - - - -Model conversion API enables direct conversion of PyTorch models. We -will utilize the ``openvino.convert_model`` method to acquire OpenVINO -IR versions of the models. The method requires a model object and -example input for model tracing. Under the hood, the converter will use -the PyTorch JIT compiler, to build a frozen model graph. - -The pipeline consists of three important parts: - -- The `T5 text encoder `__ - that translates user prompts into vectors in the latent space that - the next model - the MusicGen decoder can utilize. -- The `MusicGen Language - Model `__ - that auto-regressively generates audio tokens (codes). -- The `EnCodec model `__ - (we will use only the decoder part of it) is used to decode the audio - waveform from the audio tokens predicted by the MusicGen Language - Model. - -Let us convert each model step by step. - -0. Set Up Variables -~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - models_dir = Path("./models") - t5_ir_path = models_dir / "t5.xml" - musicgen_0_ir_path = models_dir / "mg_0.xml" - musicgen_ir_path = models_dir / "mg.xml" - audio_decoder_ir_path = models_dir / "encodec.xml" - -1. Convert Text Encoder -~~~~~~~~~~~~~~~~~~~~~~~ - - - -The text encoder is responsible for converting the input prompt, such as -“90s rock song with loud guitars and heavy drums” into an embedding -space that can be fed to the next model. Typically, it is a -transformer-based encoder that maps a sequence of input tokens to a -sequence of text embeddings. - -The input for the text encoder consists of a tensor ``input_ids``, which -contains token indices from the text processed by the tokenizer and -``attention_mask`` that we will ignore as we will process one prompt at -a time and this vector will just consist of ones. - -We use OpenVINO Converter (OVC) below to convert the PyTorch model to -the OpenVINO Intermediate Representation format (IR), which you can -infer later with `OpenVINO -runtime `__ - -.. code:: ipython3 - - if not t5_ir_path.exists(): - t5_ov = ov.convert_model(model.text_encoder, example_input={"input_ids": inputs["input_ids"]}) - - ov.save_model(t5_ov, t5_ir_path) - del t5_ov - gc.collect() - -2. 
Convert MusicGen Language Model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -This model is the central part of the whole pipeline, it takes the -embedded text representation and generates audio codes that can be then -decoded into actual music. The model outputs several streams of audio -codes - tokens sampled from the pre-trained codebooks representing music -efficiently with a lower frame rate. The model employs innovative codes -intervaling strategy, that makes single-stage generation possible. - -On the 0th generation step the model accepts ``input_ids`` representing -the indices of audio codes, ``encoder_hidden_states`` and -``encoder_attention_mask`` that were provided by the text encoder. - -.. code:: ipython3 - - # Set model config `torchscript` to True, so the model returns a tuple as output - model.decoder.config.torchscript = True - - if not musicgen_0_ir_path.exists(): - decoder_input = { - "input_ids": torch.ones(8, 1, dtype=torch.int64), - "encoder_hidden_states": torch.ones(2, 12, 1024, dtype=torch.float32), - "encoder_attention_mask": torch.ones(2, 12, dtype=torch.int64), - } - mg_ov_0_step = ov.convert_model(model.decoder, example_input=decoder_input) - - ov.save_model(mg_ov_0_step, musicgen_0_ir_path) - del mg_ov_0_step - gc.collect() - -On further iterations, the model is also provided with a -``past_key_values`` argument that contains previous outputs of the -attention block, it allows us to save on computations. But for us, it -means that the signature of the model’s ``forward`` method changed. -Models in OpenVINO IR have frozen calculation graphs and do not allow -optional arguments, that is why the MusicGen model must be converted a -second time, with an increased number of inputs. - -.. code:: ipython3 - - # Add additional argument to the example_input dict - if not musicgen_ir_path.exists(): - # Add `past_key_values` to the converted model signature - decoder_input["past_key_values"] = tuple( - [ - ( - torch.ones(2, 16, 1, 64, dtype=torch.float32), - torch.ones(2, 16, 1, 64, dtype=torch.float32), - torch.ones(2, 16, 12, 64, dtype=torch.float32), - torch.ones(2, 16, 12, 64, dtype=torch.float32), - ) - ] - * 24 - ) - - mg_ov = ov.convert_model(model.decoder, example_input=decoder_input) - for input in mg_ov.inputs[3:]: - input.get_node().set_partial_shape(ov.PartialShape([-1, 16, -1, 64])) - input.get_node().set_element_type(ov.Type.f32) - - mg_ov.validate_nodes_and_infer_types() - - ov.save_model(mg_ov, musicgen_ir_path) - del mg_ov - gc.collect() - -3. Convert Audio Decoder -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The audio decoder which is a part of the EnCodec model is used to -recover the audio waveform from the audio tokens predicted by the -MusicGen decoder. To learn more about the model please refer to the -corresponding `OpenVINO example `__. - -.. 
code:: ipython3 - - if not audio_decoder_ir_path.exists(): - - class AudioDecoder(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, output_ids): - return self.model.decode(output_ids, [None]) - - audio_decoder_input = {"output_ids": torch.ones((1, 1, 4, n_tokens - 3), dtype=torch.int64)} - - with torch.no_grad(): - audio_decoder_ov = ov.convert_model(AudioDecoder(model.audio_encoder), example_input=audio_decoder_input) - ov.save_model(audio_decoder_ov, audio_decoder_ir_path) - del audio_decoder_ov - gc.collect() - -Embedding the converted models into the original pipeline ---------------------------------------------------------- - - - -OpenVINO™ Runtime Python API is used to compile the model in OpenVINO IR -format. The -`Core `__ -class provides access to the OpenVINO Runtime API. The ``core`` object, -which is an instance of the ``Core`` class represents the API and it is -used to compile the model. - -.. code:: ipython3 - - core = ov.Core() - -Select inference device -^^^^^^^^^^^^^^^^^^^^^^^ - - - -Select device that will be used to do models inference using OpenVINO -from the dropdown list: - -.. code:: ipython3 - - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import device_widget - - device = device_widget() - - device - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("music-generation.ipynb") - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO') - - - -Adapt OpenVINO models to the original pipeline -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Here we create wrapper classes for all three OpenVINO models that we -want to embed in the original inference pipeline. Here are some of the -things to consider when adapting an OV model: - Make sure that -parameters passed by the original pipeline are forwarded to the compiled -OV model properly; sometimes the OV model uses only a portion of the -input arguments and some are ignored, sometimes you need to convert the -argument to another data type or unwrap some data structures such as -tuples or dictionaries. - Guarantee that the wrapper class returns -results to the pipeline in an expected format. In the example below you -can see how we pack OV model outputs into special classes declared in -the HF repo. - Pay attention to the model method used in the original -pipeline for calling the model - it may be not the ``forward`` method! -Refer to the ``AudioDecoderWrapper`` to see how we wrap OV model -inference into the ``decode`` method. - -.. 
code:: ipython3 - - class TextEncoderWrapper(torch.nn.Module): - def __init__(self, encoder_ir, config): - super().__init__() - self.encoder = core.compile_model(encoder_ir, device.value) - self.config = config - - def forward(self, input_ids, **kwargs): - last_hidden_state = self.encoder(input_ids)[self.encoder.outputs[0]] - last_hidden_state = torch.tensor(last_hidden_state) - return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=last_hidden_state) - - - class MusicGenWrapper(torch.nn.Module): - def __init__( - self, - music_gen_lm_0_ir, - music_gen_lm_ir, - config, - num_codebooks, - build_delay_pattern_mask, - apply_delay_pattern_mask, - ): - super().__init__() - self.music_gen_lm_0 = core.compile_model(music_gen_lm_0_ir, device.value) - self.music_gen_lm = core.compile_model(music_gen_lm_ir, device.value) - self.config = config - self.num_codebooks = num_codebooks - self.build_delay_pattern_mask = build_delay_pattern_mask - self.apply_delay_pattern_mask = apply_delay_pattern_mask - - def forward( - self, - input_ids: torch.LongTensor = None, - encoder_hidden_states: torch.FloatTensor = None, - encoder_attention_mask: torch.LongTensor = None, - past_key_values: Optional[Tuple[torch.FloatTensor]] = None, - **kwargs - ): - if past_key_values is None: - model = self.music_gen_lm_0 - arguments = (input_ids, encoder_hidden_states, encoder_attention_mask) - else: - model = self.music_gen_lm - arguments = ( - input_ids, - encoder_hidden_states, - encoder_attention_mask, - *past_key_values, - ) - - output = model(arguments) - return CausalLMOutputWithCrossAttentions( - logits=torch.tensor(output[model.outputs[0]]), - past_key_values=tuple([output[model.outputs[i]] for i in range(1, 97)]), - ) - - - class AudioDecoderWrapper(torch.nn.Module): - def __init__(self, decoder_ir, config): - super().__init__() - self.decoder = core.compile_model(decoder_ir, device.value) - self.config = config - self.output_type = namedtuple("AudioDecoderOutput", ["audio_values"]) - - def decode(self, output_ids, audio_scales): - output = self.decoder(output_ids)[self.decoder.outputs[0]] - return self.output_type(audio_values=torch.tensor(output)) - -Now we initialize the wrapper objects and load them to the HF pipeline - -.. 
code:: ipython3 - - text_encode_ov = TextEncoderWrapper(t5_ir_path, model.text_encoder.config) - musicgen_decoder_ov = MusicGenWrapper( - musicgen_0_ir_path, - musicgen_ir_path, - model.decoder.config, - model.decoder.num_codebooks, - model.decoder.build_delay_pattern_mask, - model.decoder.apply_delay_pattern_mask, - ) - audio_encoder_ov = AudioDecoderWrapper(audio_decoder_ir_path, model.audio_encoder.config) - - del model.text_encoder - del model.decoder - del model.audio_encoder - gc.collect() - - model.text_encoder = text_encode_ov - model.decoder = musicgen_decoder_ov - model.audio_encoder = audio_encoder_ov - - - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_attention_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - decoder_delay_pattern_mask=None, - guidance_scale=None, - **kwargs, - ): - if decoder_delay_pattern_mask is None: - ( - decoder_input_ids, - decoder_delay_pattern_mask, - ) = self.decoder.build_delay_pattern_mask( - decoder_input_ids, - self.generation_config.pad_token_id, - max_length=self.generation_config.max_length, - ) - - # apply the delay pattern mask - decoder_input_ids = self.decoder.apply_delay_pattern_mask(decoder_input_ids, decoder_delay_pattern_mask) - - if guidance_scale is not None and guidance_scale > 1: - # for classifier free guidance we need to replicate the decoder args across the batch dim (we'll split these - # before sampling) - decoder_input_ids = decoder_input_ids.repeat((2, 1)) - if decoder_attention_mask is not None: - decoder_attention_mask = decoder_attention_mask.repeat((2, 1)) - - if past_key_values is not None: - # cut decoder_input_ids if past is used - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - } - - - model.prepare_inputs_for_generation = partial(prepare_inputs_for_generation, model) - -We can now infer the pipeline backed by OpenVINO models. - -.. code:: ipython3 - - processor = AutoProcessor.from_pretrained("facebook/musicgen-small") - - inputs = processor( - text=["80s pop track with bassy drums and synth"], - return_tensors="pt", - ) - - audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=n_tokens) - - Audio(audio_values[0].cpu().numpy(), rate=sampling_rate) - - - - -.. raw:: html - - - - - - - -Try out the converted pipeline ------------------------------- - - - -The demo app below is created using `Gradio -package `__ - -.. code:: ipython3 - - def _generate(prompt): - inputs = processor( - text=[ - prompt, - ], - padding=True, - return_tensors="pt", - ) - audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=n_tokens) - waveform = audio_values[0].cpu().squeeze() * 2**15 - return (sampling_rate, waveform.numpy().astype(np.int16)) - -.. 
code:: ipython3 - - import gradio as gr - - demo = gr.Interface( - fn=_generate, - inputs=[ - gr.Textbox(label="Text Prompt"), - ], - outputs=["audio"], - examples=[ - ["80s pop track with bassy drums and synth"], - ["Earthy tones, environmentally conscious, ukulele-infused, harmonic, breezy, easygoing, organic instrumentation, gentle grooves"], - ["90s rock song with loud guitars and heavy drums"], - ["Heartful EDM with beautiful synths and chords"], - ], - allow_flagging="never", - ) - try: - demo.launch(debug=True) - except Exception: - demo.launch(share=True, debug=True) - - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ diff --git a/docs/notebooks/named-entity-recognition-with-output.rst b/docs/notebooks/named-entity-recognition-with-output.rst deleted file mode 100644 index e17b9ebe3066a2..00000000000000 --- a/docs/notebooks/named-entity-recognition-with-output.rst +++ /dev/null @@ -1,417 +0,0 @@ -Named entity recognition with OpenVINO™ -======================================= - -The Named Entity Recognition(NER) is a natural language processing -method that involves the detecting of key information in the -unstructured text and categorizing it into pre-defined categories. These -categories or named entities refer to the key subjects of text, such as -names, locations, companies and etc. - -NER is a good method for the situations when a high-level overview of a -large amount of text is needed. NER can be helpful with such task as -analyzing key information in unstructured text or automates the -information extraction of large amounts of data. - -This tutorial shows how to perform named entity recognition using -OpenVINO. We will use the pre-trained model -`elastic/distilbert-base-cased-finetuned-conll03-english `__. -It is DistilBERT based model, trained on -`conll03 english dataset `__. -The model can recognize four named entities in text: persons, locations, -organizations and names of miscellaneous entities that do not belong to -the previous three groups. The model is sensitive to capital letters. - -To simplify the user experience, the `Hugging Face -Optimum `__ library is used to -convert the model to OpenVINO™ IR format and quantize it. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Download the NER model <#download-the-ner-model>`__ -- `Quantize the model, using Hugging Face Optimum - API <#quantize-the-model-using-hugging-face-optimum-api>`__ -- `Compare the Original and Quantized - Models <#compare-the-original-and-quantized-models>`__ - - - `Compare performance <#compare-performance>`__ - - `Compare size of the models <#compare-size-of-the-models>`__ - -- `Prepare demo for Named Entity Recognition OpenVINO - Runtime <#prepare-demo-for-named-entity-recognition-openvino-runtime>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. 
code:: ipython3 - - %pip install -q "diffusers>=0.17.1" "openvino>=2023.1.0" "nncf>=2.5.0" "gradio>=4.19" "onnx>=1.11.0,<1.16.2" "transformers>=4.33.0" "torch>=2.1" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" - -Download the NER model ----------------------- - - - -We load the -`distilbert-base-cased-finetuned-conll03-english `__ -model from the `Hugging Face Hub `__ with -`Hugging Face Transformers -library `__\ and Optimum -Intel with OpenVINO integration. - -``OVModelForTokenClassification`` is represent model class for Named -Entity Recognition task in Optimum Intel. Model class initialization -starts with calling ``from_pretrained`` method. For conversion original -PyTorch model to OpenVINO format on the fly, ``export=True`` parameter -should be used. To easily save the model, you can use the -``save_pretrained()`` method. After saving the model on disk, we can use -pre-converted model for next usage, and speedup deployment process. - -.. code:: ipython3 - - from pathlib import Path - from transformers import AutoTokenizer - from optimum.intel import OVModelForTokenClassification - - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("named-entity-recognition.ipynb") - - original_ner_model_dir = Path("original_ner_model") - - model_id = "elastic/distilbert-base-cased-finetuned-conll03-english" - if not original_ner_model_dir.exists(): - model = OVModelForTokenClassification.from_pretrained(model_id, export=True) - - model.save_pretrained(original_ner_model_dir) - else: - model = OVModelForTokenClassification.from_pretrained(model_id, export=True) - - tokenizer = AutoTokenizer.from_pretrained(model_id) - -Quantize the model, using Hugging Face Optimum API --------------------------------------------------- - - - -Post-training static quantization introduces an additional calibration -step where data is fed through the network in order to compute the -activations quantization parameters. For quantization it will be used -`Hugging Face Optimum Intel -API `__. - -To handle the NNCF quantization process we use class -`OVQuantizer `__. -The quantization with Hugging Face Optimum Intel API contains the next -steps: \* Model class initialization starts with calling -``from_pretrained()`` method. \* Next we create calibration dataset with -``get_calibration_dataset()`` to use for the post-training static -quantization calibration step. \* After we quantize a model and save the -resulting model in the OpenVINO IR format to save_directory with -``quantize()`` method. \* Then we load the quantized model. The Optimum -Inference models are API compatible with Hugging Face Transformers -models and we can just replace ``AutoModelForXxx`` class with the -corresponding ``OVModelForXxx`` class. So we use -``OVModelForTokenClassification`` to load the model. - -.. 
code:: ipython3 - - from functools import partial - from optimum.intel import OVQuantizer, OVConfig, OVQuantizationConfig - - from optimum.intel import OVModelForTokenClassification - - - def preprocess_fn(data, tokenizer): - examples = [] - for data_chunk in data["tokens"]: - examples.append(" ".join(data_chunk)) - - return tokenizer(examples, padding=True, truncation=True, max_length=128) - - - quantizer = OVQuantizer.from_pretrained(model) - calibration_dataset = quantizer.get_calibration_dataset( - "conll2003", - preprocess_function=partial(preprocess_fn, tokenizer=tokenizer), - num_samples=100, - dataset_split="train", - preprocess_batch=True, - trust_remote_code=True, - ) - - # The directory where the quantized model will be saved - quantized_ner_model_dir = "quantized_ner_model" - - # Apply static quantization and save the resulting model in the OpenVINO IR format - ov_config = OVConfig(quantization_config=OVQuantizationConfig(num_samples=len(calibration_dataset))) - quantizer.quantize( - calibration_dataset=calibration_dataset, - save_directory=quantized_ner_model_dir, - ov_config=ov_config, - ) - - -.. parsed-literal:: - - /home/ea/miniconda3/lib/python3.11/site-packages/datasets/load.py:2516: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0. - You can remove this warning by passing 'token=' instead. - warnings.warn( - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - INFO:nncf:18 ignored nodes were found by name in the NNCFGraph - INFO:nncf:25 ignored nodes were found by name in the NNCFGraph - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - # Load the quantized model - optimized_model = OVModelForTokenClassification.from_pretrained(quantized_ner_model_dir, device=device.value) - - -.. parsed-literal:: - - Compiling the model to AUTO ... - - -Compare the Original and Quantized Models ------------------------------------------ - - - -Compare the original -`distilbert-base-cased-finetuned-conll03-english `__ -model with quantized and converted to OpenVINO IR format models to see -the difference. - -Compare performance -~~~~~~~~~~~~~~~~~~~ - - - -As the Optimum Inference models are API compatible with Hugging Face -Transformers models, we can just use ``pipleine()`` from `Hugging Face -Transformers API `__ for -inference. - -.. code:: ipython3 - - from transformers import pipeline - - ner_pipeline_optimized = pipeline("token-classification", model=optimized_model, tokenizer=tokenizer) - - ner_pipeline_original = pipeline("token-classification", model=model, tokenizer=tokenizer) - -.. 
code:: ipython3 - - import time - import numpy as np - - - def calc_perf(ner_pipeline): - inference_times = [] - - for data in calibration_dataset: - text = " ".join(data["tokens"]) - start = time.perf_counter() - ner_pipeline(text) - end = time.perf_counter() - inference_times.append(end - start) - - return np.median(inference_times) - - - print(f"Median inference time of quantized model: {calc_perf(ner_pipeline_optimized)} ") - - print(f"Median inference time of original model: {calc_perf(ner_pipeline_original)} ") - - -.. parsed-literal:: - - Median inference time of quantized model: 0.0063508255407214165 - Median inference time of original model: 0.007429798366501927 - - -Compare size of the models -~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from pathlib import Path - - fp_model_file = Path(original_ner_model_dir) / "openvino_model.bin" - print(f"Size of original model in Bytes is {fp_model_file.stat().st_size}") - print(f'Size of quantized model in Bytes is {Path(quantized_ner_model_dir, "openvino_model.bin").stat().st_size}') - - -.. parsed-literal:: - - Size of original model in Bytes is 260795516 - Size of quantized model in Bytes is 65802712 - - -Prepare demo for Named Entity Recognition OpenVINO Runtime ----------------------------------------------------------- - - - -Now, you can try NER model on own text. Put your sentence to input text -box, click Submit button, the model label the recognized entities in the -text. - -.. code:: ipython3 - - import gradio as gr - - - def run_ner(text): - output = ner_pipeline_optimized(text) - return {"text": text, "entities": output} - - - demo = gr.Interface( - fn=run_ner, - inputs=gr.Textbox(placeholder="Enter sentence here...", label="Input Text"), - outputs=gr.HighlightedText(label="Output Text"), - examples=[ - "My name is Wolfgang and I live in Berlin.", - ], - allow_flagging="never", - ) - - try: - demo.launch(debug=False) - except Exception: - demo.launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/nano-llava-multimodal-chatbot-with-output.rst b/docs/notebooks/nano-llava-multimodal-chatbot-with-output.rst deleted file mode 100644 index 1b9e67989eba0e..00000000000000 --- a/docs/notebooks/nano-llava-multimodal-chatbot-with-output.rst +++ /dev/null @@ -1,663 +0,0 @@ -Visual-language assistant with nanoLLaVA and OpenVINO -===================================================== - -nanoLLaVA is a “small but mighty” 1B vision-language model designed to -run efficiently on edge devices. It uses -`SigLIP-400m `__ -as Image Encoder and -`Qwen1.5-0.5B `__ as LLM. In -this tutorial, we consider how to convert and run nanoLLaVA model using -OpenVINO. 
Additionally, we will optimize model using -`NNCF `__ - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Select Model <#select-model>`__ -- `Convert and Optimize model <#convert-and-optimize-model>`__ - - - `Convert model to OpenVINO IR - format <#convert-model-to-openvino-ir-format>`__ - - `Compress Model weights to 4 and 8 bits using - NNCF <#compress-model-weights-to-4-and-8-bits-using-nncf>`__ - - `Image Encoder <#image-encoder>`__ - - `Language Model <#language-model>`__ - -- `Prepare model inference - pipeline <#prepare-model-inference-pipeline>`__ -- `Run OpenVINO Model Inference <#run-openvino-model-inference>`__ - - - `Select device <#select-device>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "torch>=2.1" "transformers>=4.45" "accelerate" "pillow" "gradio>=4.26" "tqdm" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "nncf>=2.14" - %pip install -q -U "openvino-tokenizers[transformers]>=2024.5.0" "openvino>=2024.5.0" - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" - -.. code:: ipython3 - - from pathlib import Path - import requests - - helper_file = Path("ov_nano_llava_helper.py") - cmd_helper_file = Path("cmd_helper.py") - - if not helper_file.exists(): - r = requests.get( - url=f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/nano-llava-multimodal-chatbot/{helper_file.name}" - ) - helper_file.open("w").write(r.text) - - if not cmd_helper_file.exists(): - r = requests.get(url=f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/{cmd_helper_file.name}") - cmd_helper_file.open("w").write(r.text) - -Select Model ------------- - - - -The tutorial supports the following models from Phi-3 model family: - -- `nanoLLaVA `__ -- `nanoLLaVA-1.5 `__ - -You can select one from the provided options below. - -.. code:: ipython3 - - import ipywidgets as widgets - - model_ids = ["qnguyen3/nanoLLaVA", "qnguyen3/nanoLLaVA-1.5"] - - model_dropdown = widgets.Dropdown( - options=model_ids, - value=model_ids[0], - description="Model:", - disabled=False, - ) - - model_dropdown - - - - -.. parsed-literal:: - - Dropdown(description='Model:', options=('qnguyen3/nanoLLaVA', 'qnguyen3/nanoLLaVA-1.5'), value='qnguyen3/nanoL… - - - -Download PyTorch model ----------------------- - - - -.. code:: ipython3 - - from ov_nano_llava_helper import converted_model_exists, copy_model_files - - model_id = model_dropdown.value - model_dir = Path(model_id.split("/")[-1]) - ov_model_dir = Path("ov_" + model_dir.name) / "FP16" - -Convert and Optimize model --------------------------- - - - -Our model conversion and optimization consist of following steps: 1. -Convert model to OpenVINO format and save it on disk. 2. Compress model -weights using NNCF - -Let’s consider each step deeply. - -Convert model to OpenVINO IR format -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -NanoLLaVA implementation is based on `HuggingFace -Transformers `__ -library. For convenience, we will use OpenVINO integration with -HuggingFace Optimum. 
`Optimum -Intel `__ is the -interface between the Transformers and Diffusers libraries and the -different tools and libraries provided by Intel to accelerate end-to-end -pipelines on Intel architectures. - -Among other use cases, Optimum Intel provides a simple interface to -optimize your Transformers and Diffusers models, convert them to the -OpenVINO Intermediate Representation (IR) format and run inference using -OpenVINO Runtime. ``optimum-cli`` provides command line interface for -model conversion and optimization. - -General command format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where task is task to export the model for, if not specified, the task -will be auto-inferred based on the model. You can find a mapping between -tasks and model classes in Optimum TaskManager -`documentation `__. -Additionally, you can specify weights compression using -``--weight-format`` argument with one of following options: ``fp32``, -``fp16``, ``int8`` and ``int4``. For int8 and int4 -`nncf `__ will be used for -weight compression. More details about model export provided in `Optimum -Intel -documentation `__. - -.. code:: ipython3 - - from cmd_helper import optimum_cli - - if not converted_model_exists(ov_model_dir): - optimum_cli(model_id, ov_model_dir, additional_args={"task": "image-text-to-text", "trust-remote-code": "", "weight-format": "fp16"}) - -Compress Model weights to 4 and 8 bits using NNCF -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -For reducing memory consumption, weights compression optimization can be -applied using `NNCF `__. Weight -compression aims to reduce the memory footprint of a model. It can also -lead to significant performance improvement for large memory-bound -models, such as Large Language Models (LLMs). LLMs and other models, -which require extensive memory to store the weights during inference, -can benefit from weight compression in the following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method primarily -designed to optimize LLMs. The main difference between weights -compression and full model quantization (post-training quantization) is -that activations remain floating-point in the case of weights -compression which leads to a better accuracy. Weight compression for -LLMs provides a solid inference performance improvement which is on par -with the performance of the full model quantization. In addition, weight -compression is data-free and does not require a calibration dataset, -making it easy to use. - -``nncf.compress_weights`` function can be used for performing weights -compression. The function accepts an OpenVINO model and other -compression parameters. Compared to INT8 compression, INT4 compression -improves performance even more, but introduces a minor drop in -prediction quality. - -More details about weights compression, can be found in `OpenVINO -documentation `__. - -Please select below whether you would like to run INT4 weight -compression instead of INT8 weight compression. - -.. 
code:: ipython3 - - import ipywidgets as widgets - - compression_mode = widgets.Dropdown( - options=["INT4", "INT8"], - value="INT4", - description="Compression mode:", - disabled=False, - ) - - compression_mode - - - - -.. parsed-literal:: - - Dropdown(description='Compression mode:', options=('INT4', 'INT8'), value='INT4') - - - -.. code:: ipython3 - - import nncf - import openvino as ov - - core = ov.Core() - - if compression_mode.value == "INT4": - ov_compressed_model_dir = ov_model_dir.parent / "INT4" - llava_wc_parameters = dict(mode=nncf.CompressWeightsMode.INT4_ASYM, group_size=128, ratio=0.8) - else: - ov_compressed_model_dir = ov_model_dir.parent / "INT8" - llava_wc_parameters = dict(mode=nncf.CompressWeightsMode.INT8) - - image_encoder_wc_parameters = dict(mode=nncf.CompressWeightsMode.INT8) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -Image Encoder -~~~~~~~~~~~~~ - - - -Image Encoder is represented in nanoLLaVA by pretrained SigLIP model. -Image encoder is responsible for encoding input images into embedding -space. Code bellow demonstrates how to apply weights compression for -image encoder model. - -.. code:: ipython3 - - import gc - - compressed_vision_encoder_path = ov_compressed_model_dir / "openvino_vision_embeddings_model.xml" - vision_encoder_path = ov_model_dir / "openvino_vision_embeddings_model.xml" - if not compressed_vision_encoder_path.exists(): - ov_vision_encoder = core.read_model(vision_encoder_path) - ov_compressed_vision_encoder = nncf.compress_weights(ov_vision_encoder, **image_encoder_wc_parameters) - ov.save_model(ov_compressed_vision_encoder, compressed_vision_encoder_path) - del ov_compressed_vision_encoder - del ov_vision_encoder - gc.collect(); - - -.. parsed-literal:: - - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/nncf/quantization/quantize_model.py:432: FutureWarning: `CompressWeightsMode.INT8` is deprecated. Please, use `CompressWeightsMode.INT8_ASYM` as value instead. - warning_deprecated( - 2024-10-17 19:50:20.293849: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-10-17 19:50:20.295807: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used. - 2024-10-17 19:50:20.332354: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-10-17 19:50:21.104169: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - -.. parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (159 / 159) │ 100% (159 / 159) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - - - -.. 
parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -Language Model -~~~~~~~~~~~~~~ - - - -Language Model is responsible for generation answer in LLaVA. This part -is very similar to standard LLM for text generation. Our model uses -`Qwen/Qwen1.5-0.5B `__ as base -LLM. - -.. code:: ipython3 - - compressed_llm_path = ov_compressed_model_dir / "openvino_language_model.xml" - llm_path = ov_model_dir / "openvino_language_model.xml" - - if not compressed_llm_path.exists(): - ov_llm = core.read_model(llm_path) - ov_compressed_llm = nncf.compress_weights(ov_llm, **llava_wc_parameters) - ov.save_model(ov_compressed_llm, compressed_llm_path) - del ov_compressed_llm - del ov_llm - gc.collect() - - copy_model_files(ov_model_dir, ov_compressed_model_dir) - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 47% (48 / 169) │ 20% (47 / 168) │ - ├────────────────┼─────────────────────────────┼────────────────────────────────────────┤ - │ 4 │ 53% (121 / 169) │ 80% (121 / 168) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -Prepare model inference pipeline --------------------------------- - - - -OpenVINO integration with Optimum Intel provides ready-to-use API for -model inference that can be used for smooth integration with -transformers-based solutions. For loading pixtral model, we will use -``OVModelForVisualCausalLM`` class that have compatible interface with -Transformers Pixtral implementation. For loading a model, -``from_pretrained`` method should be used. It accepts path to the model -directory or model_id from HuggingFace hub (if model is not converted to -OpenVINO format, conversion will be triggered automatically). -Additionally, we can provide an inference device, quantization config -(if model has not been quantized yet) and device-specific OpenVINO -Runtime configuration. More details about model inference with Optimum -Intel can be found in -`documentation `__. - -Run OpenVINO Model Inference ----------------------------- - - - -Select device -~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import device_widget - - device = device_widget("CPU", exclude=["NPU"]) - - device - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("nano-llava-multimodal-chatbot.ipynb") - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -Optimum Intel provides Transformers-like interface for inference -OpenVINO models that allows smooth integration into user application, -where you need just replace model class, other parts of pipeline - -preprocessing and postprocessing code remains the same. 
It means that we -can use the same tokenizer and image processor that provided with model. - -.. code:: ipython3 - - from optimum.intel.openvino import OVModelForVisualCausalLM - from transformers import AutoConfig, AutoTokenizer, AutoProcessor, TextStreamer - - # prepare tokenizer - tokenizer = AutoTokenizer.from_pretrained(ov_compressed_model_dir, trust_remote_code=True) - - # prepare image processor - config = AutoConfig.from_pretrained(ov_compressed_model_dir, trust_remote_code=True) - processor = AutoProcessor.from_pretrained(config.mm_vision_tower) - - # initialize OpenVINO model inference class - ov_model = OVModelForVisualCausalLM.from_pretrained(ov_compressed_model_dir, device=device.value, trust_remote_code=True) - -.. code:: ipython3 - - from ov_nano_llava_helper import process_images, process_text_input - from PIL import Image - - prompt = "Describe this image in detail" - - messages = [{"role": "user", "content": f"\n{prompt}"}] - text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - test_image = Path("nanollava.png") - - if not test_image.exists(): - url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/8bf7d9f2-018a-4498-bec4-55f17c273ecc" - image = Image.open(requests.get(url, stream=True).raw) - image.save(test_image) - else: - image = Image.open(test_image) - image_tensor = process_images(image, None, processor) - input_ids, attention_mask = process_text_input(text, tokenizer) - - streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) - - display(image) - print(f"Question:\n{prompt}") - print("Answer:") - - output_ids = ov_model.generate(input_ids, attention_mask=attention_mask, pixel_values=image_tensor, max_new_tokens=128, use_cache=True, streamer=streamer) - - - -.. image:: nano-llava-multimodal-chatbot-with-output_files/nano-llava-multimodal-chatbot-with-output_22_0.png - - -.. parsed-literal:: - - Setting `pad_token_id` to `eos_token_id`:None for open-end generation. - - -.. parsed-literal:: - - Question: - Describe this image in detail - Answer: - The image showcases a playful scene with a white, fluffy llama as the main subject. The llama appears to be in the process of making a face, with its mouth open, revealing its pink tongue. The llama's face is characterized by a small nose and a pair of eyes that are quite expressive. It also has a distinct ear that stands out due to its pink color. The llama's fur is fluffy and white, and it has a small black nose. - The llama is depicted in a dynamic pose, with its hind leg visible. The leg appears to be positioned in a way that suggests the llama is in the middle of a movement. - - -Interactive demo ----------------- - - - -.. 
code:: ipython3 - - from transformers import TextIteratorStreamer, StoppingCriteria - from threading import Thread - import torch - - - class KeywordsStoppingCriteria(StoppingCriteria): - def __init__(self, keywords, tokenizer, input_ids): - self.keywords = keywords - self.keyword_ids = [] - self.max_keyword_len = 0 - for keyword in keywords: - cur_keyword_ids = tokenizer(keyword).input_ids - if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: - cur_keyword_ids = cur_keyword_ids[1:] - if len(cur_keyword_ids) > self.max_keyword_len: - self.max_keyword_len = len(cur_keyword_ids) - self.keyword_ids.append(torch.tensor(cur_keyword_ids)) - self.tokenizer = tokenizer - self.start_len = input_ids.shape[1] - - def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: - offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len) - self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] - for keyword_id in self.keyword_ids: - truncated_output_ids = output_ids[0, -keyword_id.shape[0] :] - if torch.equal(truncated_output_ids, keyword_id): - return True - outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] - for keyword in self.keywords: - if keyword in outputs: - return True - return False - - def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: - outputs = [] - for i in range(output_ids.shape[0]): - outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores)) - return all(outputs) - - - def bot_streaming(message, history): - messages = [] - if message["files"]: - image = message["files"][-1]["path"] if isinstance(message["files"][-1], dict) else message["files"][-1] - else: - for _, hist in enumerate(history): - if isinstance(hist[0], tuple): - image = hist[0][0] - - if len(history) > 0 and image is not None: - messages.append({"role": "user", "content": f"\n{history[1][0]}"}) - messages.append({"role": "assistant", "content": history[1][1]}) - for human, assistant in history[2:]: - if assistant is None: - continue - messages.append({"role": "user", "content": human}) - messages.append({"role": "assistant", "content": assistant}) - messages.append({"role": "user", "content": message["text"]}) - elif len(history) > 0 and image is None: - for human, assistant in history: - if assistant is None: - continue - messages.append({"role": "user", "content": human}) - messages.append({"role": "assistant", "content": assistant}) - messages.append({"role": "user", "content": message["text"]}) - elif len(history) == 0 and image is not None: - messages.append({"role": "user", "content": f"\n{message['text']}"}) - elif len(history) == 0 and image is None: - messages.append({"role": "user", "content": message["text"]}) - - print(messages) - image = Image.open(image).convert("RGB") - text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_tensor = process_images(image, None, processor) - input_ids, attention_mask = process_text_input(text, tokenizer) - stop_str = "<|im_end|>" - keywords = [stop_str] - stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) - streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) - generation_kwargs = dict( - input_ids=input_ids, - attention_mask=attention_mask, - pixel_values=image_tensor, - streamer=streamer, - max_new_tokens=128, - stopping_criteria=[stopping_criteria], - 
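        # Note: a temperature this low makes generation effectively deterministic;
        # the TextIteratorStreamer above yields partial text as soon as tokens are produced.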
temperature=0.01, - ) - thread = Thread(target=ov_model.generate, kwargs=generation_kwargs) - thread.start() - - buffer = "" - for new_text in streamer: - buffer += new_text - generated_text_without_prompt = buffer[:] - yield generated_text_without_prompt - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/nano-llava-multimodal-chatbot/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=bot_streaming) - - try: - demo.launch(debug=True) - except Exception: - demo.launch(share=True, debug=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/nano-llava-multimodal-chatbot-with-output_files/nano-llava-multimodal-chatbot-with-output_22_0.jpg b/docs/notebooks/nano-llava-multimodal-chatbot-with-output_files/nano-llava-multimodal-chatbot-with-output_22_0.jpg deleted file mode 100644 index b8bcb19d9398ee..00000000000000 --- a/docs/notebooks/nano-llava-multimodal-chatbot-with-output_files/nano-llava-multimodal-chatbot-with-output_22_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:795e7d90a3a734604a845cbab2e038d9ee5b713ab33390183562af970605fe42 -size 65057 diff --git a/docs/notebooks/nano-llava-multimodal-chatbot-with-output_files/nano-llava-multimodal-chatbot-with-output_22_0.png b/docs/notebooks/nano-llava-multimodal-chatbot-with-output_files/nano-llava-multimodal-chatbot-with-output_22_0.png deleted file mode 100644 index f84a025f54e780..00000000000000 --- a/docs/notebooks/nano-llava-multimodal-chatbot-with-output_files/nano-llava-multimodal-chatbot-with-output_22_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:425c949d0a4a907c2ffdaeef53fcd948f1b184cfe325bf69b76994a5a9b6c1d0 -size 490750 diff --git a/docs/notebooks/notebooks_with_binder_buttons.txt b/docs/notebooks/notebooks_with_binder_buttons.txt deleted file mode 100644 index 332de977617a66..00000000000000 --- a/docs/notebooks/notebooks_with_binder_buttons.txt +++ /dev/null @@ -1,38 +0,0 @@ -3D-pose-estimation -3D-segmentation-point-clouds -action-recognition-webcam -async-api -auto-device -convert-to-openvino -depth-anything -detectron2-to-openvino -fast-segment-anything -handwritten-ocr -hello-detection -hello-segmentation -hello-world -hugging-face-hub -image-classification-quantization -knowledge-graphs-conve -magika-content-type-recognition -meter-reader -music-generation -object-detection -openvino-api -openvino-tokenizers -openvoice -paddle-ocr-webcam -paddle-to-openvino-classification -person-tracking -pose-estimation -pytorch-onnx-to-openvino -291-rmbg-background-removal -style-transfer -tensorflow-classification-to-openvino -tensorflow-hub -tensorflow-instance-segmentation-to-openvino -tensorflow-object-detection-to-openvino -tflite-selfie-segmentation -vehicle-detection-and-recognition -vision-background-removal -vision-monodepth diff --git a/docs/notebooks/notebooks_with_colab_buttons.txt b/docs/notebooks/notebooks_with_colab_buttons.txt deleted file mode 100644 index 2c412b89825c55..00000000000000 --- a/docs/notebooks/notebooks_with_colab_buttons.txt +++ /dev/null @@ -1,67 +0,0 @@ -3D-segmentation-point-clouds -async-api -auto-device -clip-language-saliency-map 
-convert-to-openvino -depth-anything -depth-anything-v2 -detectron2-to-openvino -explainable-ai-1-basic -explainable-ai-2-deep-dive -explainable-ai-3-map-interpretation -fast-segment-anything -grounded-segment-anything -handwritten-ocr -hello-detection -hello-segmentation -hello-world -hugging-face-hub -image-classification-quantization -knowledge-graphs-conve -language-quantize-bert -magika-content-type-recognition -mobileclip-video-search -modelscope-to-openvino -music-generation -named-entity-recognition -nano-llava-multimodal-chatbot -nuextract-structure-extraction -object-detection -openvino-api -openvino-tokenizers -optical-character-recognition -optimize-preprocessing -paddle-ocr-webcam -paddle-to-openvino-classification -person-counting -person-tracking -pix2struct-docvqa -pytorch-onnx-to-openvino -pytorch-to-openvino -291-rmbg-background-removal -siglip-zero-shot-image-classification -softvc-voice-conversion -sparsity-optimization -speculative-sampling -speech-recognition-quantization-wav2vec2 -style-transfer -surya-line-level-text-detection -table-question-answering -tensorflow-classification-to-openvino -tensorflow-hub -tensorflow-instance-segmentation-to-openvino -tensorflow-object-detection-to-openvino -tensorflow-quantization-aware-training -tflite-selfie-segmentation -tflite-to-openvino -tiny-sd-image-generation -vision-background-removal -vision-monodepth -whisper-subtitles-generation -yolov11-instance-segmentation -yolov11-keypoint-detection -yolov11-object-detection -yolov8-instance-segmentation -yolov8-keypoint-detection -yolov8-obb -yolov8-object-detection diff --git a/docs/notebooks/nuextract-structure-extraction-with-output.rst b/docs/notebooks/nuextract-structure-extraction-with-output.rst deleted file mode 100644 index db04a502199cb8..00000000000000 --- a/docs/notebooks/nuextract-structure-extraction-with-output.rst +++ /dev/null @@ -1,495 +0,0 @@ -Structure Extraction with NuExtract and OpenVINO -================================================ - -.. figure:: https://github.com/user-attachments/assets/70dd93cc-da36-4c53-8891-78c0f9a41f20 - :alt: image - - image - -`NuExtract `__ model is a -text-to-JSON Large Language Model (LLM) that allows to extract -arbitrarily complex information from text and turns it into structured -data. - -LLM stands for “Large Language Model” which refers to a type of -artificial intelligence model that is designed to understand and -generate human-like text based on the input it receives. LLMs are -trained on large datasets of text to learn patterns, grammar, and -semantic relationships, allowing them to generate coherent and -contextually relevant responses. One core capability of Large Language -Models (LLMs) is to follow natural language instructions. -Instruction-following models are capable of generating text in response -to prompts and are often used for tasks like writing assistance, -chatbots, and content generation. - -In this tutorial, we consider how to run a structure extraction text -generation pipeline using NuExtract model and OpenVINO. We will use -pre-trained models from the `Hugging Face -Transformers `__ -library. The `Hugging Face Optimum -Intel `__ library -converts the models to OpenVINO™ IR format. To simplify the user -experience, we will use `OpenVINO Generate -API `__ for -generation inference pipeline. 
- -The tutorial consists of the following steps: - -- Install prerequisites -- Download and convert the model from a public source using the - `OpenVINO integration with Hugging Face - Optimum `__ -- Compress model weights to INT8 and INT4 with `OpenVINO - NNCF `__ -- Create a structure extraction inference pipeline with `Generate - API `__ -- Launch interactive Gradio demo with structure extraction pipeline - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Select model for inference <#select-model-for-inference>`__ -- `Download and convert model to OpenVINO IR via Optimum Intel - CLI <#download-and-convert-model-to-openvino-ir-via-optimum-intel-cli>`__ -- `Compress model weights <#compress-model-weights>`__ - - - `Weights Compression using Optimum Intel - CLI <#weights-compression-using-optimum-intel-cli>`__ - -- `Select device for inference and model - variant <#select-device-for-inference-and-model-variant>`__ -- `Create a structure extraction inference - pipeline <#create-a-structure-extraction-inference-pipeline>`__ -- `Run interactive structure extraction demo with - Gradio <#run-interactive-structure-extraction-demo-with-gradio>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip uninstall -q -y optimum optimum-intel - %pip install -Uq "openvino>=2024.3.0" "openvino-genai" - %pip install -q "torch>=2.1" "nncf>=2.12" "transformers>=4.40.0" "accelerate" "gradio>=4.19" "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu - -.. code:: ipython3 - - import os - from pathlib import Path - import requests - import shutil - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file - - # Fetch llm_config.py - llm_config_shared_path = Path("../../utils/llm_config.py") - llm_config_dst_path = Path("llm_config.py") - - if not llm_config_dst_path.exists(): - if llm_config_shared_path.exists(): - try: - os.symlink(llm_config_shared_path, llm_config_dst_path) - except Exception: - shutil.copy(llm_config_shared_path, llm_config_dst_path) - else: - download_file(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py") - elif not os.path.islink(llm_config_dst_path): - print("LLM config will be updated") - if llm_config_shared_path.exists(): - shutil.copy(llm_config_shared_path, llm_config_dst_path) - else: - download_file(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py") - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("nuextract-structure-extraction.ipynb") - -Select model for inference --------------------------- - - - -The tutorial supports different models, you can select one from the -provided options to compare the quality of open source solutions. 
->\ **Note**: conversion of some models can require additional actions -from user side and at least 64GB RAM for conversion. - -NuExtract model has several versions: - -- **NuExtract-tiny** - This is a version of - `Qwen1.5-0.5 `__ model with - 0.5 billion parameters. More details about the model can be found in - `model card `__. -- **NuExtract** - This is a version of - `phi-3-mini `__ - model with 3.8 billion parameters. More details about the model can - be found in `model card `__. -- **NuExtract-large** - This is a version of - `phi-3-small `__ - model with 7 billion parameters. More details about the model can be - found in `model - card `__. - -All NuExtract models are fine-tuned on a private high-quality synthetic -dataset for information extraction. - -.. code:: ipython3 - - from llm_config import get_llm_selection_widget - - models = { - "NuExtract_tiny": {"model_id": "numind/NuExtract-tiny"}, - "NuExtract": {"model_id": "numind/NuExtract"}, - "NuExtract_large": {"model_id": "numind/NuExtract-large"}, - } - - form, _, model_dropdown, compression_dropdown, _ = get_llm_selection_widget(languages=None, models=models, show_preconverted_checkbox=False) - - form - - - - -.. parsed-literal:: - - Box(children=(Box(children=(Label(value='Model:'), Dropdown(options={'NuExtract_tiny': {'model_id': 'numind/Nu… - - - -.. code:: ipython3 - - model_name = model_dropdown.label - model_config = model_dropdown.value - print(f"Selected model {model_name} with {compression_dropdown.value} compression") - - -.. parsed-literal:: - - Selected model NuExtract_tiny with INT4 compression - - -Download and convert model to OpenVINO IR via Optimum Intel CLI ---------------------------------------------------------------- - - - -Listed model are available for downloading via the `HuggingFace -hub `__. We will use optimum-cli -interface for exporting it into OpenVINO Intermediate Representation -(IR) format. - -Optimum CLI interface for converting models supports export to OpenVINO -(supported starting optimum-intel 1.12 version). General command format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where ``--model`` argument is model id from HuggingFace Hub or local -directory with model (saved using ``.save_pretrained`` method), -``--task`` is one of `supported -task `__ -that exported model should solve. If ``--task`` is not specified, the -task will be auto-inferred based on the model. If model initialization -requires to use remote code, ``--trust-remote-code`` flag additionally -should be passed. Full list of supported arguments available via -``--help`` For more details and examples of usage, please check `optimum -documentation `__. - -Compress model weights ----------------------- - - - -The Weights Compression algorithm is aimed at compressing the weights of -the models and can be used to optimize the model footprint and -performance of large models where the size of weights is relatively -larger than the size of activations, for example, Large Language Models -(LLM). Compared to INT8 compression, INT4 compression improves -performance even more but introduces a minor drop in prediction quality. - -Weights Compression using Optimum Intel CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Optimum Intel supports weight compression via NNCF out of the box. For -8-bit compression we pass ``--weight-format int8`` to ``optimum-cli`` -command line. 
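For illustration, an 8-bit export of the smallest model could look like this
(the output directory name here is only an assumption that mirrors the INT4
example shown later in this notebook):

.. code:: bash

    optimum-cli export openvino --model numind/NuExtract-tiny --task text-generation-with-past --weight-format int8 NuExtract_tiny/INT8_compressed_weights
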
For 4 bit compression we provide ``--weight-format int4`` -and some other options containing number of bits and other compression -parameters. An example of this approach usage you can find in -`llm-chatbot notebook `__ - - **Note**: This tutorial involves conversion model for FP16 and - INT4/INT8 weights compression scenarios. It may be memory and - time-consuming in the first run. You can manually control the - compression precision below. **Note**: There may be no speedup for - INT4/INT8 compressed models on dGPU - -.. code:: ipython3 - - from llm_config import convert_and_compress_model - - model_dir = convert_and_compress_model(model_name, model_config, compression_dropdown.value, use_preconverted=False) - - -.. parsed-literal:: - - ⌛ NuExtract_tiny conversion to INT4 started. It may takes some time. - - - -**Export command:** - - - -``optimum-cli export openvino --model numind/NuExtract-tiny --task text-generation-with-past --weight-format int4 --group-size 128 --ratio 0.8 NuExtract_tiny/INT4_compressed_weights`` - - -.. parsed-literal:: - - Framework not specified. Using pt to export the model. - Using framework PyTorch: 2.3.1+cpu - Overriding 1 configuration item(s) - - use_cache -> True - We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache) - /home/ytarkan/miniconda3/envs/ov_notebooks_env/lib/python3.9/site-packages/optimum/exporters/openvino/model_patcher.py:489: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if sequence_length != 1: - /home/ytarkan/miniconda3/envs/ov_notebooks_env/lib/python3.9/site-packages/transformers/models/qwen2/modeling_qwen2.py:110: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if seq_len > self.max_seq_len_cached: - - -.. parsed-literal:: - - Mixed-Precision assignment ━━━━━━━━━━━━━━━━━━━━ 100% 168/168 • 0:00:01 • 0:00:00• 0:00:01 - [?25hINFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 47% (47 / 169) │ 20% (46 / 168) │ - ├────────────────┼─────────────────────────────┼────────────────────────���───────────────┤ - │ 4 │ 53% (122 / 169) │ 80% (122 / 168) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - Applying Weight Compression ━━━━━━━━━━━━━━━━━━━ 100% 169/169 • 0:00:05 • 0:00:00• 0:00:01 - [?25h - -.. parsed-literal:: - - Set tokenizer padding side to left for `text-generation-with-past` task. - Replacing `(?!\S)` pattern to `(?:$|[^\S])` in RegexSplit operation - - -.. parsed-literal:: - - ✅ INT4 NuExtract_tiny model converted and can be found in NuExtract_tiny/INT4_compressed_weights - - -Let’s compare model size for different compression types - -.. 
code:: ipython3 - - from llm_config import compare_model_size - - compare_model_size(model_dir) - - -.. parsed-literal:: - - Size of model with INT4 compressed weights is 347.03 MB - - -Select device for inference and model variant ---------------------------------------------- - - - - **Note**: There may be no speedup for INT4/INT8 compressed models on - dGPU. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(default="CPU", exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'GPU', 'AUTO'), value='CPU') - - - -Create a structure extraction inference pipeline ------------------------------------------------- - - - -Firstly we will prepare input prompt for NuExtract model by introducing -``prepare_input()`` function. This function combines the main text, a -JSON schema and optional examples into a single string that adheres to -model’s specific input requirements. - -``prepare_input()`` function accepts the following parameters: 1. -``text``: This is the primary text from which you want to extract -information. 2. ``schema``: A JSON schema string that defines the -structure of the information you want to extract. This acts as a -template, guiding NuExtract model on what data to look for and how to -format the output. 3. ``examples``: An optional list of example strings. -These can be used to provide the model with sample extractions, -potentially improving accuracy for complex or ambiguous cases. - -.. code:: ipython3 - - import json - from typing import List - - - def prepare_input(text: str, schema: str, examples: List[str] = ["", "", ""]) -> str: - schema = json.dumps(json.loads(schema), indent=4) - input_llm = "<|input|>\n### Template:\n" + schema + "\n" - for example in examples: - if example != "": - input_llm += "### Example:\n" + json.dumps(json.loads(example), indent=4) + "\n" - - input_llm += "### Text:\n" + text + "\n<|output|>\n" - return input_llm - -To simplify user experience we will use `OpenVINO Generate -API `__. -We will create pipeline with ``LLMPipeline``. ``LLMPipeline`` is the -main object used for decoding. You can construct it straight away from -the folder with the converted model. It will automatically load the -``main model``, ``tokenizer``, ``detokenizer`` and default -``generation configuration``. After that we will configure parameters -for decoding. We can get default config with -``get_generation_config()``, setup parameters and apply the updated -version with ``set_generation_config(config)`` or put config directly to -``generate()``. It’s also possible to specify the needed options just as -inputs in the ``generate()`` method, as shown below. Then we just run -``generate`` method and get the output in text format. We do not need to -encode input prompt according to model expected template or write -post-processing code for logits decoder, it will be done easily with -LLMPipeline. - -.. code:: ipython3 - - import openvino_genai as ov_genai - - pipe = ov_genai.LLMPipeline(model_dir.as_posix(), device.value) - - - def run_structure_extraction(text: str, schema: str) -> str: - input = prepare_input(text, schema) - return pipe.generate(input, max_new_tokens=200) - -To run structure extraction inference pipeline we need to provide -example text for data extraction and define output structure in a JSON -schema format: - -.. 
code:: ipython3 - - text = """We introduce Mistral 7B, a 7-billion-parameter language model engineered for - superior performance and efficiency. Mistral 7B outperforms the best open 13B - model (Llama 2) across all evaluated benchmarks, and the best released 34B - model (Llama 1) in reasoning, mathematics, and code generation. Our model - leverages grouped-query attention (GQA) for faster inference, coupled with sliding - window attention (SWA) to effectively handle sequences of arbitrary length with a - reduced inference cost. We also provide a model fine-tuned to follow instructions, - Mistral 7B - Instruct, that surpasses Llama 2 13B - chat model both on human and - automated benchmarks. Our models are released under the Apache 2.0 license. - Code: https://github.com/mistralai/mistral-src - Webpage: https://mistral.ai/news/announcing-mistral-7b/""" - - schema = """{ - "Model": { - "Name": "", - "Number of parameters": "", - "Number of max token": "", - "Architecture": [] - }, - "Usage": { - "Use case": [], - "Licence": "" - } - }""" - - output = run_structure_extraction(text, schema) - print(output) - - -.. parsed-literal:: - - { - "Model": { - "Name": "Mistral 7B", - "Number of parameters": "7-billion", - "Number of max token": "", - "Architecture": [ - "grouped-query attention", - "sliding window attention" - ] - }, - "Usage": { - "Use case": [ - "reasoning", - "mathematics", - "code generation" - ], - "Licence": "Apache 2.0" - } - } - - - -Run interactive structure extraction demo with Gradio ------------------------------------------------------ - - - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/nuextract-structure-extraction/gradio_helper.py" - ) - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=run_structure_extraction) - - try: - demo.launch(height=800) - except Exception: - demo.launch(share=True, height=800) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ - -.. code:: ipython3 - - # Uncomment and run this cell for stopping gradio interface - # demo.close() diff --git a/docs/notebooks/object-detection-with-output.rst b/docs/notebooks/object-detection-with-output.rst deleted file mode 100644 index dafda88f8ca971..00000000000000 --- a/docs/notebooks/object-detection-with-output.rst +++ /dev/null @@ -1,329 +0,0 @@ -Live Object Detection with OpenVINO™ -==================================== - -This notebook demonstrates live object detection with OpenVINO, using -the `Ultralytics -YOLOv8 `__. Final part of -this notebook shows live inference results from a webcam. Additionally, -you can also upload a video file. - - **NOTE**: To use this notebook with a webcam, you need to run the - notebook on a computer with a webcam. If you run the notebook on a - server, the webcam will not work. However, you can still do inference - on a video. 
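If you are not sure whether a webcam is present, a quick check with OpenCV can
help before running the demo (a minimal sketch; device index 0 is assumed to
be the primary camera):

.. code:: ipython3

    import cv2

    # try to open the primary webcam and report whether it is available
    cap = cv2.VideoCapture(0)
    print("Webcam available:", cap.isOpened())
    cap.release()
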
- - -**Table of contents:** - - -- `Preparation <#preparation>`__ - - - `Install requirements <#install-requirements>`__ - - `Imports <#imports>`__ - -- `The Model <#the-model>`__ - - - `Download and convert the - Model <#download-and-convert-the-model>`__ - - `Load the Model <#load-the-model>`__ - -- `Processing <#processing>`__ - - - `Main Processing Function <#main-processing-function>`__ - -- `Run <#run>`__ - - - `Run Live Object Detection <#run-live-object-detection>`__ - -- `References <#references>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Preparation ------------ - - - -Install requirements -~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %pip install -q "openvino>=2024.4.0" - %pip install -q "ultralytics==8.3.59" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q opencv-python requests tqdm - - # Fetch `notebook_utils` module - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("object-detection.ipynb") - -Imports -~~~~~~~ - - - -.. code:: ipython3 - - from pathlib import Path - import gc - import openvino as ov - from ultralytics import YOLO - - import notebook_utils as utils - -The Model ---------- - - - -Download and convert the Model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # A directory where the model will be downloaded. - - # The name of the model - model_name = "yolov8n" - - det_model_path = Path(f"{model_name}_openvino_model/{model_name}.xml") - - # export model to OpenVINO format using Ultralytics API - if not det_model_path.exists(): - pt_model = YOLO(f"{model_name}.pt") - pt_model.export(format="openvino", dynamic=True, half=True) - del pt_model - gc.collect() - -Load the Model -~~~~~~~~~~~~~~ - - - -Only a few lines of code are required to run the model. First, -initialize OpenVINO Runtime. Then, read the network architecture and -model weights from the ``.bin`` and ``.xml`` files to compile for the -desired device. If you choose ``GPU`` you need to wait for a while, as -the startup time is much longer than in the case of ``CPU``. - -There is a possibility to let OpenVINO decide which hardware offers the -best performance. For that purpose, just use ``AUTO``. - -.. code:: ipython3 - - core = ov.Core() - - device = utils.device_widget() - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. 
code:: ipython3 - - core = ov.Core() - - - def load_model(det_model_path, device): - compiled_model = compile_model(det_model_path, device) - det_model = YOLO(det_model_path.parent, task="detect") - - if det_model.predictor is None: - custom = {"conf": 0.25, "batch": 1, "save": False, "mode": "predict"} # method defaults - args = {**det_model.overrides, **custom} - det_model.predictor = det_model._smart_load("predictor")(overrides=args, _callbacks=det_model.callbacks) - det_model.predictor.setup_model(model=det_model.model) - - det_model.predictor.model.ov_compiled_model = compiled_model - return det_model - - - def compile_model(det_model_path, device): - det_ov_model = core.read_model(det_model_path) - - ov_config = {} - if device != "CPU": - det_ov_model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device or ("AUTO" in device and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - det_compiled_model = core.compile_model(det_ov_model, device, ov_config) - return det_compiled_model - - - det_model = load_model(det_model_path, device.value) - - -.. parsed-literal:: - - Ultralytics 8.3.0 🚀 Python-3.11.4 torch-2.5.1+cpu CPU (Intel Core(TM) i9-10980XE 3.00GHz) - Loading yolov8n_openvino_model for OpenVINO inference... - Using OpenVINO LATENCY mode for batch=1 inference... - - -Main Processing Function -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Run object detection on the specified source. Either a webcam or a video -file. - -.. code:: ipython3 - - from IPython import display - import cv2 - import numpy as np - - - # Main processing function to run object detection. - def run_object_detection( - source=0, - flip=False, - use_popup=False, - skip_first_frames=0, - ): - player = None - try: - # Create a video player to play with target fps. - player = utils.VideoPlayer(source=source, flip=flip, fps=30, skip_first_frames=skip_first_frames) - # Start capturing. - player.start() - if use_popup: - title = "Press ESC to Exit" - cv2.namedWindow(winname=title, flags=cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE) - - while True: - # Grab the frame. - frame = player.next() - if frame is None: - print("Source ended") - break - # If the frame is larger than full HD, reduce size to improve the performance. - scale = 1280 / max(frame.shape) - if scale < 1: - frame = cv2.resize( - src=frame, - dsize=None, - fx=scale, - fy=scale, - interpolation=cv2.INTER_AREA, - ) - # Get the results. - input_image = np.array(frame) - detections = det_model(input_image, verbose=False) - frame = detections[0].plot() - - # Use this workaround if there is flickering. - if use_popup: - cv2.imshow(winname=title, mat=frame) - key = cv2.waitKey(1) - # escape = 27 - if key == 27: - break - else: - # Encode numpy array to jpg. - _, encoded_img = cv2.imencode(ext=".jpg", img=frame, params=[cv2.IMWRITE_JPEG_QUALITY, 100]) - # Create an IPython image. - i = display.Image(data=encoded_img) - # Display the image in this notebook. - display.clear_output(wait=True) - display.display(i) - # ctrl-c - except KeyboardInterrupt: - print("Interrupted") - # any different error - except RuntimeError as e: - print(e) - finally: - if player is not None: - # Stop capturing. - player.stop() - if use_popup: - cv2.destroyAllWindows() - -Run ---- - - - -Run Live Object Detection -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Use a webcam as the video input. By default, the primary webcam is set -with ``source=0``. If you have multiple webcams, each one will be -assigned a consecutive number starting at 0. 
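For example, a second camera could be selected like this (shown only as an
illustration; the cell at the end of this section uses a video file by
default):

.. code:: ipython3

    # use the second webcam (index 1) as the source for live detection
    run_object_detection(source=1, flip=False, use_popup=False)
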
Set ``flip=True`` when -using a front-facing camera. Some web browsers, especially Mozilla -Firefox, may cause flickering. If you experience flickering, set -``use_popup=True``. - - **NOTE**: To use this notebook with a webcam, you need to run the - notebook on a computer with a webcam. If you run the notebook on a - server (for example, Binder), the webcam will not work. Popup mode - may not work if you run this notebook on a remote computer (for - example, Binder). - -If you do not have a webcam, you can still run this demo with a video -file. Any `format supported by -OpenCV `__ -will work. - -Run the object detection: - -.. code:: ipython3 - - USE_WEBCAM = False - - video_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Coco%20Walking%20in%20Berkeley.mp4" - - video_file = Path("coco.mp4") - - if not USE_WEBCAM and not video_file.exists(): - utils.download_file(video_url, filename=video_file.name) - - cam_id = 0 - - source = cam_id if USE_WEBCAM else video_file - - run_object_detection(source=source, flip=isinstance(source, int), use_popup=False) - - - -.. image:: object-detection-with-output_files/object-detection-with-output_13_0.png - - -.. parsed-literal:: - - Source ended - diff --git a/docs/notebooks/object-detection-with-output_files/object-detection-with-output_13_0.png b/docs/notebooks/object-detection-with-output_files/object-detection-with-output_13_0.png deleted file mode 100644 index 1c548a0445636e..00000000000000 --- a/docs/notebooks/object-detection-with-output_files/object-detection-with-output_13_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b3fca049d156843f0976e9386ece07d1846e92efcacc5c559de140e7733b67b3 -size 171752 diff --git a/docs/notebooks/omnigen-with-output.rst b/docs/notebooks/omnigen-with-output.rst deleted file mode 100644 index 6f8f99f930b29a..00000000000000 --- a/docs/notebooks/omnigen-with-output.rst +++ /dev/null @@ -1,363 +0,0 @@ -Unified image generation using OmniGen and OpenVINO -=================================================== - -OmniGen is a unified image generation model that can generate a wide -range of images from multi-modal prompts. It is designed to be simple, -flexible, and easy to use. Existing image generation models often -require loading several additional network modules (such as ControlNet, -IP-Adapter, Reference-Net, etc.) and performing extra preprocessing -steps (e.g., face detection, pose estimation, cropping, etc.) to -generate a satisfactory image. OmniGen can generate various images -directly through arbitrarily multi-modal instructions without additional -plugins and operations. it can automatically identify the features -(e.g., required object, human pose, depth mapping) in input images -according to the text prompt. - -Here are the illustrations of OmniGen’s capabilities: - -- You can control the image generation flexibly via OmniGen - - .. figure:: https://github.com/VectorSpaceLab/OmniGen/raw/main/imgs/demo_cases.png - :alt: exanple_1.png - - exanple_1.png - -- Referring Expression Generation: You can input multiple images and - use simple, general language to refer to the objects within those - images. OmniGen can automatically recognize the necessary objects in - each image and generate new images based on them. No additional - operations, such as image cropping or face detection, are required. - - .. 
figure:: https://github.com/VectorSpaceLab/OmniGen/raw/main/imgs/referring.png - :alt: example_2.png - - example_2.png - -You can find more details about a model on `project -page `__, -`paper `__, `original -repository `__. - -This tutorial considers how to run and optimize OmniGen using OpenVINO. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Convert and Optimize model <#convert-and-optimize-model>`__ - - - `Compress model weights <#compress-model-weights>`__ - -- `Prepare inference pipeline <#prepare-inference-pipeline>`__ - - - `Select inference device <#select-inference-device>`__ - -- `Run model inference <#run-model-inference>`__ -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - import requests - from pathlib import Path - - helper_files = ["ov_omnigen_helper.py", "gradio_helper.py"] - utility_files = ["notebook_utils.py", "cmd_helper.py"] - - - for utility in utility_files: - if not Path(utility).exists(): - r = requests.get(f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/{utility}") - with open(utility, "w") as f: - f.write(r.text) - - for helper in helper_files: - if not Path(helper).exists(): - r = requests.get(f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/omnigen/{helper}") - with open(helper, "w") as f: - f.write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("omnigen.ipynb") - -.. code:: ipython3 - - import platform - - %pip install -q "torch>=2.2" "transformers>=4.45.2" "accelerate>=0.26.1" "diffusers>=0.30.3" "timm" "peft>=0.9.0" "safetensors" "pillow" "opencv-python" "tqdm" "gradio>=4.19" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "openvino>=2024.6.0" "nncf>=2.14" - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - -.. code:: ipython3 - - from cmd_helper import clone_repo - - clone_repo("https://github.com/VectorSpaceLab/OmniGen.git") - - - - -.. parsed-literal:: - - PosixPath('OmniGen') - - - -Convert and Optimize model --------------------------- - - - -For starting work with OpenVINO we should convert models to OpenVINO -Intermediate Representation format first. - -`OpenVINO model conversion -API `__ -should be used for these purposes. ``ov.convert_model`` function accepts -original model instance and example input for tracing and returns -``ov.Model`` representing this model in OpenVINO framework. The -converted model can be used for saving on disk using ``ov.save_model`` -function or directly loading on the device using ``core.complie_model``. - -|image1| Model consists of 2 key components: - -- **Transformer**\ \* is Phi3-based model that gradually denoising - input latents guided by timestep, images and text prompt embeddings. -- **VAE** for encoding input images to latent space and decoding - generation results to image space - -For convenience we will use ``convert_omnigen_model`` function -``ov_omnigen_helper.py``. - -.. |image1| image:: https://vectorspacelab.github.io/OmniGen/img/framework.png - -.. 
code:: ipython3 - - from ov_omnigen_helper import convert_omingen_model - - # Uncomment the line to see model conversion code - # ??convert_omnigen_model - - -.. parsed-literal:: - - 2024-12-27 12:58:09.587676: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-12-27 12:58:09.600460: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - WARNING: All log messages before absl::InitializeLog() is called are written to STDERR - E0000 00:00:1735289889.615212 147835 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - E0000 00:00:1735289889.619665 147835 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2024-12-27 12:58:09.635197: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -Compress model weights -~~~~~~~~~~~~~~~~~~~~~~ - - - -For reducing memory consumption, weights compression optimization can be -applied using `NNCF `__. Weight -compression aims to reduce the memory footprint of a model. It can also -lead to significant performance improvement for large memory-bound -models, such as Large Language Models (LLMs). - -LLMs and other models, which require extensive memory to store the -weights during inference, can benefit from weight compression in the -following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method primarily -designed to optimize LLMs. The main difference between weights -compression and full model quantization (post-training quantization) is -that activations remain floating-point in the case of weights -compression which leads to a better accuracy. Weight compression for -LLMs provides a solid inference performance improvement which is on par -with the performance of the full model quantization. In addition, weight -compression is data-free and does not require a calibration dataset, -making it easy to use. - -``nncf.compress_weights`` function can be used for performing weights -compression. The function accepts an OpenVINO model and other -compression parameters. Compared to INT8 compression, INT4 compression -improves performance even more, but introduces a minor drop in -prediction quality. - -More details about weights compression, can be found in `OpenVINO -documentation `__. - -.. 
code:: ipython3 - - import nncf - - compression_configuration = { - "mode": nncf.CompressWeightsMode.INT4_ASYM, - "group_size": 64, - "ratio": 1.0, - } - - model_id = "Shitao/OmniGen-v1" - model_path = Path("omnigen_ov") - - convert_omingen_model(model_id, model_path, compression_configuration) - - -.. parsed-literal:: - - Model already converted and can be found in omnigen_ov - - -Prepare inference pipeline --------------------------- - - - -``ov_omnigen_helper.py`` contains ``OVOmniGenPipeline`` that loads -OpenVINO OmniGen models and runs inference using a specified device. Its -usage is similar to the original ``OmniGenPipeline``. - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget("CPU", ["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -.. code:: ipython3 - - from ov_omnigen_helper import OVOmniGenPipeline - - # Uncomment this line to see model inference pipeline - # ??OVOmniGenPipeliine - -.. code:: ipython3 - - pipe = OVOmniGenPipeline(model_path, device.value) - -Run model inference -------------------- - - - -.. code:: ipython3 - - images = pipe( - prompt="Two cute cats, one of them is ginger and another one is black, quality details, hyper realistic, high definition, photorealistic", - height=320, - width=320, - guidance_scale=2.5, - seed=0, - max_input_image_size=320, - num_inference_steps=25, - ) - images[0] - - - -.. parsed-literal:: - - 0%| | 0/25 [00:00<|image_1|>.", - input_images=["cats.png"], - height=320, - width=320, - guidance_scale=2.5, - img_guidance_scale=1.6, - seed=0, - max_input_image_size=320, - num_inference_steps=30, - ) - images[0] - - - -.. parsed-literal:: - - 0%| | 0/30 [00:00`__, -`paper `__, `original -repo `__ and `model -card `__. In this tutorial -we consider how to run OmniParser using OpenVINO. - - -**Table of contents:** - -- `Prerequisites <#prerequisites>`__ -- `Prepare models <#prepare-models>`__ - - - `Convert models to OpenVINO Intermediate representation - format <#convert-models-to-openvino-intermediate-representation-format>`__ - - - `Icon Detector <#icon-detector>`__ - - `Screen captioning model <#screen-captioning-model>`__ - -- `Run OmniParser using OpenVINO <#run-omniparser-using-openvino>`__ - - - `Icon Detector <#icon-detector>`__ - - - `Select inference device for icon - detector <#select-inference-device-for-icon-detector>`__ - - - `Screen regions captioning <#screen-regions-captioning>`__ - - - `Select device for screen region - captioning <#select-device-for-screen-region-captioning>`__ - - - `Recognition text on the - screen <#recognition-text-on-the-screen>`__ - - - `Select device for OCR <#select-device-for-ocr>`__ - - - `Run model inference <#run-model-inference>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. 
code:: ipython3 - - from pathlib import Path - import requests - import shutil - - notebook_utils_path = Path("notebook_utils.py") - florence_helper_path = Path("ov_florence2_helper.py") - omniparser_helper_path = Path("ov_omniparser_helper.py") - pip_utils_path = Path("pip_helper.py") - - if not notebook_utils_path.exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - notebook_utils_path.open("w", encoding="utf-8").write(r.text) - - if not pip_utils_path.exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py", - ) - pip_utils_path.open("w", encoding="utf-8").write(r.text) - - if not florence_helper_path.exists(): - if Path("../florence2/ov_florence2_helper.py").exists(): - shutil.copy(Path("../florence2/ov_florence2_helper.py"), florence_helper_path) - else: - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/florence2/ov_florence2_helper.py") - florence_helper_path.open("w", encoding="utf-8").write(r.text) - - if not omniparser_helper_path.exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/omniparser/ov_omniparser_helper.py") - omniparser_helper_path.open("w", encoding="utf-8").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("omniparser.ipynb") - -.. code:: ipython3 - - import platform - from pip_helper import pip_install - - pip_install( - "torch>=2.1", - "torchvision", - "accelerate", - "transformers>=4.45", - "timm", - "einops==0.8.0", - "ultralytics==8.3.59", - "pillow", - "opencv-python", - "gradio>=4.19", - "defusedxml", - "pyyaml", - "scipy", - "scikit-image", - "python-bidi", - "pyclipper", - "Shapely", - "ninja", - "tqdm", - "--extra-index-url", - "https://download.pytorch.org/whl/cpu", - ) - pip_install("--no-deps", "supervision==0.18.0") - pip_install("--no-deps", "easyocr", "--extra-index-url", "https://download.pytorch.org/whl/cpu") - pip_install("-U", "--pre", "--extra-index-url", "https://storage.openvinotoolkit.org/simple/wheels/pre-release", "openvino>=2024.6.0") - - if platform.system() == "Darwin": - pip_install("numpy<2.0") - -Prepare models --------------- - - - -OmniParser leverages a two-step process: 1. Interactable Region -Detection: - Identifies clickable elements like buttons and icons within -a UI. - Employs a specialized model trained on a diverse dataset of web -pages. - Accurately detects interactive elements, even in complex UIs. - -2. Semantic Captioning: - - - Assigns meaningful descriptions to detected elements. - - Combines optical character recognition (OCR) and a captioning - model. - - Provides context for accurate action generation. - -Convert models to OpenVINO Intermediate representation format -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -For starting work with OpenVINO -we should convert models to OpenVINO Intermediate Representation format -first. - -`OpenVINO model conversion -API `__ -should be used for these purposes. ``ov.convert_model`` function accepts -original model instance and example input for tracing and returns -``ov.Model`` representing this model in OpenVINO framework. 
Converted -model can be used for saving on disk using ``ov.save_model`` function or -directly loading on device using ``core.complie_model``. - -Let’s consider each pipeline part. - -Icon Detector -^^^^^^^^^^^^^ - - - -Icon detector in OmniParser is represented by YOLO based model trained -on curated by model authors interactable icon detection dataset. - -For conversion and model inference we will utilize Ultralytics provided -API. You can find more examples of this API usage in these -`tutorials `__ - -.. code:: ipython3 - - from ov_omniparser_helper import download_omniparser_icon_detector - - icon_detector_dir = download_omniparser_icon_detector() - - -.. parsed-literal:: - - :247: DeprecationWarning: The `openvino.runtime` module is deprecated and will be removed in the 2026.0 release. Please replace `openvino.runtime` with `openvino`. - 2025-01-22 21:12:17.990214: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2025-01-22 21:12:18.002873: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - WARNING: All log messages before absl::InitializeLog() is called are written to STDERR - E0000 00:00:1737565938.018756 1013181 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - E0000 00:00:1737565938.023306 1013181 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2025-01-22 21:12:18.039051: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - - - -.. parsed-literal:: - - best.pt: 0%| | 0.00/11.7M [00:00= LooseVersion("1.1.0"): - /home/ea/work/py311/lib/python3.11/site-packages/thop/profile.py:68: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. - if LooseVersion(torch.__version__) >= LooseVersion("1.1.0"): - - -.. parsed-literal:: - - model summary (fused): 168 layers, 3,005,843 parameters, 0 gradients, 8.1 GFLOPs - - PyTorch: starting from 'weights/icon_detect/best.pt' with input shape (1, 3, 640, 640) BCHW and output shape(s) (1, 5, 8400) (11.7 MB) - - OpenVINO: starting export with openvino 2025.1.0-17945-3e8bc27b226... - OpenVINO: export success ✅ 2.0s, saved as 'weights/icon_detect/best_openvino_model/' (6.1 MB) - - Export complete (2.3s) - Results saved to /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/omniparser/weights/icon_detect - Predict: yolo predict task=detect model=weights/icon_detect/best_openvino_model imgsz=640 half - Validate: yolo val task=detect model=weights/icon_detect/best_openvino_model imgsz=640 data=None half - Visualize: https://netron.app - - -Screen captioning model -^^^^^^^^^^^^^^^^^^^^^^^ - - - -The second part of OmniParser pipeline is creating detailed descriptions -of recognized clickable regions. For these purposes pipeline suggests to -use several visual language processing models like BLIP2, Florence2 or -Phi-3-Vision. 
In this tutorial we will focus on making screen region -captioning using Florence2. Previously we explained in details model -workflow and steps for running it using OpenVINO in this -`tutorial `__. - -.. code:: ipython3 - - from ov_omniparser_helper import download_omniparser_florence_model - - florence_caption_dir = download_omniparser_florence_model() - - - -.. parsed-literal:: - - Fetching 15 files: 0%| | 0/15 [00:00`__ is a python module for -extracting text from image. It is a general OCR that can read both -natural scene text and dense text in document and supports 80+ -languages. EasyOCR utilizes AI for detection text regions and recognize -text inside of predicted regions. We will also utilize both text -detection and recognition models using OpenVINO. - -Select device for OCR -^^^^^^^^^^^^^^^^^^^^^ - - - -.. code:: ipython3 - - import ipywidgets as widgets - - device_detector = device_widget(exclude=["NPU"], description="Detector device:") - device_recognizer = device_widget(exclude=["NPU"], description="Recognizer device:") - - device_box = widgets.VBox([device_detector, device_recognizer]) - device_box - - - - -.. parsed-literal:: - - VBox(children=(Dropdown(description='Detector device:', index=1, options=('CPU', 'AUTO'), value='AUTO'), Dropd… - - - -.. code:: ipython3 - - from ov_omniparser_helper import easyocr_reader - - # Uncomment the line to see easyocr_reader helper code - # ??easyocr_reader - -.. code:: ipython3 - - reader = easyocr_reader("weights/easyocr", device_detector.value, device_recognizer.value) - - -.. parsed-literal:: - - Using CPU. Note: This module is much faster with a GPU. - - - - -.. code:: ipython3 - - from PIL import Image - - test_image_path = Path("examples/windows_home.png") - test_image_path.parent.mkdir(exist_ok=True, parents=True) - - if not test_image_path.exists(): - Image.open(requests.get("https://github.com/microsoft/OmniParser/blob/master/imgs/windows_home.png?raw=true", stream=True).raw).save(test_image_path) - -Run model inference -~~~~~~~~~~~~~~~~~~~ - - - -``process_image`` function defined in ``ov_omniparser_helper.py`` -provides easy-to-use interface for screen parsing process. - -.. code:: ipython3 - - from ov_omniparser_helper import process_image - - # Uncomment this line to see process_image code - # ??process_image - -.. code:: ipython3 - - procesed_image, label_coordinates, icon_descriptions = process_image( - test_image_path, ov_icon_detector, {"model": ov_icon_caption_gen, "processor": processor}, reader - ) - - -.. parsed-literal:: - - - image 1/1 /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/omniparser/examples/windows_home.png: 640x640 32 0s, 55.1ms - Speed: 3.4ms preprocess, 55.1ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 640) - finish processing - - -Function returns image with drawn detected boxes, boxes coordinates and -description for each region. - -.. code:: ipython3 - - display(procesed_image.resize((1200, 1200))) - print(icon_descriptions) - - - -.. image:: omniparser-with-output_files/omniparser-with-output_32_0.png - - -.. parsed-literal:: - - Text Box ID 0: 3.46 PM - Text Box ID 1: Search - Text Box ID 2: Microsoft - Text Box ID 3: 10/25/2024 - Icon Box ID 4: Microsoft Outlook. - Icon Box ID 5: Image - Icon Box ID 6: Microsoft OneNote. - Icon Box ID 7: Microsoft Office. - Icon Box ID 8: a folder for organizing files. - Icon Box ID 9: Microsoft Office. - Icon Box ID 10: Security shield. - Icon Box ID 11: Microsoft 365. - Icon Box ID 12: Microsoft Edge browser. 
- Icon Box ID 13: Microsoft Edge browser. - Icon Box ID 14: Decrease - Icon Box ID 15: the Windows operating system. - Icon Box ID 16: mountains and a beach. - Icon Box ID 17: a search function. - - -Interactive demo ----------------- - - - -.. code:: ipython3 - - from gradio_helper import make_demo - - - def process_image_gradio(image, box_threshold, iou_threshold, imgsz): - image_result, _, parsed_text = process_image( - image, - ov_icon_detector, - {"model": ov_icon_caption_gen, "processor": processor}, - reader, - box_threshold=box_threshold, - iou_threshold=iou_threshold, - imgsz=imgsz, - ) - return image_result, parsed_text - - - demo = make_demo(process_image_gradio) - - try: - demo.launch(debug=True, height=600) - except Exception: - demo.launch(debug=True, share=True, height=600) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/omniparser-with-output_files/omniparser-with-output_32_0.jpg b/docs/notebooks/omniparser-with-output_files/omniparser-with-output_32_0.jpg deleted file mode 100644 index 513db4e6d0da5d..00000000000000 --- a/docs/notebooks/omniparser-with-output_files/omniparser-with-output_32_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c83af55e4296ff1dadb270b93c31084e983048437f848323c0e9677d2c3ed22 -size 161384 diff --git a/docs/notebooks/omniparser-with-output_files/omniparser-with-output_32_0.png b/docs/notebooks/omniparser-with-output_files/omniparser-with-output_32_0.png deleted file mode 100644 index a09fc0a47cd036..00000000000000 --- a/docs/notebooks/omniparser-with-output_files/omniparser-with-output_32_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:382e19a8751851ad8a151bea1f4f7bc4be62b47c7a8a4f70da0a3dae257b0c20 -size 1411816 diff --git a/docs/notebooks/oneformer-segmentation-with-output.rst b/docs/notebooks/oneformer-segmentation-with-output.rst deleted file mode 100644 index 976ee59067f1ad..00000000000000 --- a/docs/notebooks/oneformer-segmentation-with-output.rst +++ /dev/null @@ -1,745 +0,0 @@ -Universal Segmentation with OneFormer and OpenVINO -================================================== - -This tutorial demonstrates how to use the -`OneFormer `__ model from HuggingFace -with OpenVINO. It describes how to download weights and create PyTorch -model using Hugging Face transformers library, then convert model to -OpenVINO Intermediate Representation format (IR) using OpenVINO Model -Optimizer API and run model inference. Additionally, -`NNCF `__ quantization is -applied to improve OneFormer segmentation speed. - -|image0| - -OneFormer is a follow-up work of -`Mask2Former `__. The latter still -requires training on instance/semantic/panoptic datasets separately to -get state-of-the-art results. - -OneFormer incorporates a text module in the Mask2Former framework, to -condition the model on the respective subtask (instance, semantic or -panoptic). This gives even more accurate results, but comes with a cost -of increased latency, however. - -.. 
|image0| image:: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/oneformer_architecture.png - - -**Table of contents:** - - -- `Install required libraries <#install-required-libraries>`__ -- `Prepare the environment <#prepare-the-environment>`__ -- `Load OneFormer fine-tuned on COCO for universal - segmentation <#load-oneformer-fine-tuned-on-coco-for-universal-segmentation>`__ -- `Convert the model to OpenVINO IR - format <#convert-the-model-to-openvino-ir-format>`__ -- `Select inference device <#select-inference-device>`__ -- `Choose a segmentation task <#choose-a-segmentation-task>`__ -- `Inference <#inference>`__ -- `Quantization <#quantization>`__ - - - `Preparing calibration dataset <#preparing-calibration-dataset>`__ - - `Run quantization <#run-quantization>`__ - - `Compare model size and - performance <#compare-model-size-and-performance>`__ - -- `Interactive Demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Install required libraries --------------------------- - - - -.. code:: ipython3 - - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "transformers>=4.26.0" "openvino>=2023.1.0" "nncf>=2.7.0" "gradio>=4.19" "torch>=2.1" "matplotlib>=3.4" scipy ipywidgets Pillow tqdm - -Prepare the environment ------------------------ - - - -Import all required packages and set paths for models and constant -variables. - -.. code:: ipython3 - - import warnings - from collections import defaultdict - from pathlib import Path - - from transformers import OneFormerProcessor, OneFormerForUniversalSegmentation - from transformers.models.oneformer.modeling_oneformer import ( - OneFormerForUniversalSegmentationOutput, - ) - import torch - import matplotlib.pyplot as plt - import matplotlib.patches as mpatches - from PIL import Image - from PIL import ImageOps - - import openvino - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("oneformer-segmentation.ipynb") - -.. code:: ipython3 - - IR_PATH = Path("oneformer.xml") - OUTPUT_NAMES = ["class_queries_logits", "masks_queries_logits"] - -Load OneFormer fine-tuned on COCO for universal segmentation ------------------------------------------------------------- - - - -Here we use the ``from_pretrained`` method of -``OneFormerForUniversalSegmentation`` to load the `HuggingFace OneFormer -model `__ -based on Swin-L backbone and trained on -`COCO `__ dataset. - -Also, we use HuggingFace processor to prepare the model inputs from -images and post-process model outputs for visualization. - -.. code:: ipython3 - - processor = OneFormerProcessor.from_pretrained("shi-labs/oneformer_coco_swin_large") - model = OneFormerForUniversalSegmentation.from_pretrained( - "shi-labs/oneformer_coco_swin_large", - ) - id2label = model.config.id2label - -.. 
code:: ipython3 - - task_seq_length = processor.task_seq_length - shape = (800, 800) - dummy_input = { - "pixel_values": torch.randn(1, 3, *shape), - "task_inputs": torch.randn(1, task_seq_length), - } - -Convert the model to OpenVINO IR format ---------------------------------------- - - - -Convert the PyTorch model to IR format to take advantage of OpenVINO -optimization tools and features. The ``openvino.convert_model`` python -function in OpenVINO Converter can convert the model. The function -returns instance of OpenVINO Model class, which is ready to use in -Python interface. However, it can also be serialized to OpenVINO IR -format for future execution using ``save_model`` function. PyTorch to -OpenVINO conversion is based on TorchScript tracing. HuggingFace models -have specific configuration parameter ``torchscript``, which can be used -for making the model more suitable for tracing. For preparing model. we -should provide PyTorch model instance and example input to -``openvino.convert_model``. - -.. code:: ipython3 - - model.config.torchscript = True - - if not IR_PATH.exists(): - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - model = openvino.convert_model(model, example_input=dummy_input) - openvino.save_model(model, IR_PATH, compress_to_fp16=False) - -Select inference device ------------------------ - - - -Select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - core = openvino.Core() - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -We can prepare the image using the HuggingFace processor. OneFormer -leverages a processor which internally consists of an image processor -(for the image modality) and a tokenizer (for the text modality). -OneFormer is actually a multimodal model, since it incorporates both -images and text to solve image segmentation. - -.. code:: ipython3 - - def prepare_inputs(image: Image.Image, task: str): - """Convert image to model input""" - image = ImageOps.pad(image, shape) - inputs = processor(image, [task], return_tensors="pt") - converted = { - "pixel_values": inputs["pixel_values"], - "task_inputs": inputs["task_inputs"], - } - return converted - -.. code:: ipython3 - - def process_output(d): - """Convert OpenVINO model output to HuggingFace representation for visualization""" - hf_kwargs = {output_name: torch.tensor(d[output_name]) for output_name in OUTPUT_NAMES} - - return OneFormerForUniversalSegmentationOutput(**hf_kwargs) - -.. code:: ipython3 - - # Read the model from files. - model = core.read_model(model=IR_PATH) - # Compile the model. - compiled_model = core.compile_model(model=model, device_name=device.value) - -Model predicts ``class_queries_logits`` of shape -``(batch_size, num_queries)`` and ``masks_queries_logits`` of shape -``(batch_size, num_queries, height, width)``. - -Here we define functions for visualization of network outputs to show -the inference results. - -.. 
code:: ipython3 - - class Visualizer: - @staticmethod - def extract_legend(handles): - fig = plt.figure() - fig.legend(handles=handles, ncol=len(handles) // 20 + 1, loc="center") - fig.tight_layout() - return fig - - @staticmethod - def predicted_semantic_map_to_figure(predicted_map): - segmentation = predicted_map[0] - # get the used color map - viridis = plt.get_cmap("viridis", max(1, torch.max(segmentation))) - # get all the unique numbers - labels_ids = torch.unique(segmentation).tolist() - fig, ax = plt.subplots() - ax.imshow(segmentation) - ax.set_axis_off() - handles = [] - for label_id in labels_ids: - label = id2label[label_id] - color = viridis(label_id) - handles.append(mpatches.Patch(color=color, label=label)) - fig_legend = Visualizer.extract_legend(handles=handles) - fig.tight_layout() - return fig, fig_legend - - @staticmethod - def predicted_instance_map_to_figure(predicted_map): - segmentation = predicted_map[0]["segmentation"] - segments_info = predicted_map[0]["segments_info"] - # get the used color map - viridis = plt.get_cmap("viridis", max(torch.max(segmentation), 1)) - fig, ax = plt.subplots() - ax.imshow(segmentation) - ax.set_axis_off() - instances_counter = defaultdict(int) - handles = [] - # for each segment, draw its legend - for segment in segments_info: - segment_id = segment["id"] - segment_label_id = segment["label_id"] - segment_label = id2label[segment_label_id] - label = f"{segment_label}-{instances_counter[segment_label_id]}" - instances_counter[segment_label_id] += 1 - color = viridis(segment_id) - handles.append(mpatches.Patch(color=color, label=label)) - - fig_legend = Visualizer.extract_legend(handles) - fig.tight_layout() - return fig, fig_legend - - @staticmethod - def predicted_panoptic_map_to_figure(predicted_map): - segmentation = predicted_map[0]["segmentation"] - segments_info = predicted_map[0]["segments_info"] - # get the used color map - viridis = plt.get_cmap("viridis", max(torch.max(segmentation), 1)) - fig, ax = plt.subplots() - ax.imshow(segmentation) - ax.set_axis_off() - instances_counter = defaultdict(int) - handles = [] - # for each segment, draw its legend - for segment in segments_info: - segment_id = segment["id"] - segment_label_id = segment["label_id"] - segment_label = id2label[segment_label_id] - label = f"{segment_label}-{instances_counter[segment_label_id]}" - instances_counter[segment_label_id] += 1 - color = viridis(segment_id) - handles.append(mpatches.Patch(color=color, label=label)) - - fig_legend = Visualizer.extract_legend(handles) - fig.tight_layout() - return fig, fig_legend - - @staticmethod - def figures_to_images(fig, fig_legend, name_suffix=""): - seg_filename, leg_filename = ( - f"segmentation{name_suffix}.png", - f"legend{name_suffix}.png", - ) - fig.savefig(seg_filename, bbox_inches="tight") - fig_legend.savefig(leg_filename, bbox_inches="tight") - segmentation = Image.open(seg_filename) - legend = Image.open(leg_filename) - return segmentation, legend - -.. code:: ipython3 - - def segment(model, img: Image.Image, task: str): - """ - Apply segmentation on an image. - - Args: - img: Input image. It will be resized to 800x800. - task: String describing the segmentation task. Supported values are: "semantic", "instance" and "panoptic". - Returns: - Tuple[Figure, Figure]: Segmentation map and legend charts. 
- """ - if img is None: - raise gr.Error("Please load the image or use one from the examples list") - inputs = prepare_inputs(img, task) - outputs = model(inputs) - hf_output = process_output(outputs) - predicted_map = getattr(processor, f"post_process_{task}_segmentation")(hf_output, target_sizes=[img.size[::-1]]) - return getattr(Visualizer, f"predicted_{task}_map_to_figure")(predicted_map) - -.. code:: ipython3 - - if not Path("sample.jpg").exists(): - download_file("http://images.cocodataset.org/val2017/000000439180.jpg", "sample.jpg") - image = Image.open("sample.jpg") - image - - - -.. parsed-literal:: - - sample.jpg: 0%| | 0.00/194k [00:00`__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -The optimization process contains the following steps: 1. Create a -calibration dataset for quantization. 2. Run ``nncf.quantize()`` to -obtain quantized model. 3. Serialize the ``INT8`` model using -``openvino.save_model()`` function. - - Note: Quantization is time and memory consuming operation. Running - quantization code below may take some time. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - compiled_quantized_model = None - - to_quantize = quantization_widget(False) - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Let’s load skip magic extension to skip quantization if to_quantize is -not selected - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -Preparing calibration dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use images from -`COCO128 `__ -dataset as calibration samples. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - import torch.utils.data as data - - from zipfile import ZipFile - - DATA_URL = "https://ultralytics.com/assets/coco128.zip" - OUT_DIR = Path('.') - - - class COCOLoader(data.Dataset): - def __init__(self, images_path): - self.images = list(Path(images_path).iterdir()) - - def __getitem__(self, index): - image = Image.open(self.images[index]) - if image.mode == 'L': - rgb_image = Image.new("RGB", image.size) - rgb_image.paste(image) - image = rgb_image - return image - - def __len__(self): - return len(self.images) - - - def download_coco128_dataset(): - download_file(DATA_URL, directory=OUT_DIR, show_progress=True) - if not (OUT_DIR / "coco128/images/train2017").exists(): - with ZipFile('coco128.zip' , "r") as zip_ref: - zip_ref.extractall(OUT_DIR) - coco_dataset = COCOLoader(OUT_DIR / 'coco128/images/train2017') - return coco_dataset - - - def transform_fn(image): - # We quantize model in panoptic mode because it produces optimal results for both semantic and instance segmentation tasks - inputs = prepare_inputs(image, "panoptic") - return inputs - - - coco_dataset = download_coco128_dataset() - calibration_dataset = nncf.Dataset(coco_dataset, transform_fn) - - -.. 
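A hedged sketch of the ``nncf.quantize()`` call (step 2 above) with the objects defined in this section is shown below; the subset size and output file name are illustrative, not the notebook's actual settings.

.. code:: python

    # Sketch of the nncf.quantize() step; subset_size and the output name are illustrative.
    import nncf
    import openvino

    core = openvino.Core()
    fp_model = core.read_model(IR_PATH)  # the FP32 OneFormer IR converted earlier

    quantized_model = nncf.quantize(
        fp_model,
        calibration_dataset,  # nncf.Dataset built from COCO128 above
        subset_size=128,
    )
    openvino.save_model(quantized_model, "oneformer_int8.xml")

..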
parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - - -.. parsed-literal:: - - coco128.zip: 0%| | 0.00/6.66M [00:00`__ -- `Loading a Model <#loading-a-model>`__ - - - `OpenVINO IR Model <#openvino-ir-model>`__ - - `ONNX Model <#onnx-model>`__ - - `PaddlePaddle Model <#paddlepaddle-model>`__ - - `TensorFlow Model <#tensorflow-model>`__ - - `TensorFlow Lite Model <#tensorflow-lite-model>`__ - - `PyTorch Model <#pytorch-model>`__ - -- `Getting Information about a - Model <#getting-information-about-a-model>`__ - - - `Model Inputs <#model-inputs>`__ - - `Model Outputs <#model-outputs>`__ - -- `Doing Inference on a Model <#doing-inference-on-a-model>`__ -- `Reshaping and Resizing <#reshaping-and-resizing>`__ - - - `Change Image Size <#change-image-size>`__ - - `Change Batch Size <#change-batch-size>`__ - -- `Caching a Model <#caching-a-model>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - # Required imports. Please execute this cell first. - %pip install -q "openvino>=2023.1.0" - %pip install -q requests tqdm ipywidgets - - # Fetch `notebook_utils` module - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("openvino-api.ipynb") - -Loading OpenVINO Runtime and Showing Info ------------------------------------------ - - - -Initialize OpenVINO Runtime with ``ov.Core()`` - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - -OpenVINO Runtime can load a network on a device. A device in this -context means a CPU, an Intel GPU, a Neural Compute Stick 2, etc. The -``available_devices`` property shows the available devices in your -system. The “FULL_DEVICE_NAME” option to ``core.get_property()`` shows -the name of the device. - -.. code:: ipython3 - - import openvino.properties as props - - - devices = core.available_devices - - for device in devices: - device_name = core.get_property(device, props.device.full_name) - print(f"{device}: {device_name}") - - -.. parsed-literal:: - - CPU: Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz - - -Select device for inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You can specify which device from available devices will be used for -inference using this widget - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU',), value='CPU') - - - -Loading a Model ---------------- - - - -After initializing OpenVINO Runtime, first read the model file with -``read_model()``, then compile it to the specified device with the -``compile_model()`` method. - -`OpenVINO™ supports several model -formats `__ -and enables developers to convert them to its own OpenVINO IR format -using a tool dedicated to this task. 
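As a shortcut, ``compile_model()`` also accepts a model path directly, collapsing the two steps into one. A minimal sketch, assuming the classification model that is downloaded in the next section:

.. code:: python

    # Shortcut sketch: compile straight from a model file, without an explicit read_model().
    import openvino as ov

    core = ov.Core()
    compiled_model = core.compile_model("model/classification.xml", device_name="CPU")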
- -OpenVINO IR Model -~~~~~~~~~~~~~~~~~ - - - -An OpenVINO IR (Intermediate Representation) model consists of an -``.xml`` file, containing information about network topology, and a -``.bin`` file, containing the weights and biases binary data. Models in -OpenVINO IR format are obtained by using model conversion API. The -``read_model()`` function expects the ``.bin`` weights file to have the -same filename and be located in the same directory as the ``.xml`` file: -``model_weights_file == Path(model_xml).with_suffix(".bin")``. If this -is the case, specifying the weights file is optional. If the weights -file has a different filename, it can be specified using the ``weights`` -parameter in ``read_model()``. - -The OpenVINO `Model Conversion -API `__ -tool is used to convert models to OpenVINO IR format. Model conversion -API reads the original model and creates an OpenVINO IR model (``.xml`` -and ``.bin`` files) so inference can be performed without delays due to -format conversion. Optionally, model conversion API can adjust the model -to be more suitable for inference, for example, by alternating input -shapes, embedding preprocessing and cutting training parts off. For -information on how to convert your existing TensorFlow, PyTorch or ONNX -model to OpenVINO IR format with model conversion API, refer to the -`tensorflow-to-openvino `__ -and -`pytorch-onnx-to-openvino `__ -notebooks. - -.. code:: ipython3 - - ir_model_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/002-example-models/" - ir_model_name_xml = "classification.xml" - ir_model_name_bin = "classification.bin" - - if not Path("model/" + ir_model_name_xml).exists(): - download_file(ir_model_url + ir_model_name_xml, filename=ir_model_name_xml, directory="model") - download_file(ir_model_url + ir_model_name_bin, filename=ir_model_name_bin, directory="model") - - -.. parsed-literal:: - - 'model/classification.xml' already exists. - 'model/classification.bin' already exists. - - - - -.. parsed-literal:: - - PosixPath('/home/ea/work/openvino_notebooks/notebooks/openvino-api/model/classification.bin') - - - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - classification_model_xml = "model/classification.xml" - - model = core.read_model(model=classification_model_xml) - compiled_model = core.compile_model(model=model, device_name=device.value) - -ONNX Model -~~~~~~~~~~ - - - -`ONNX `__ is an open format built to represent machine -learning models. ONNX defines a common set of operators - the building -blocks of machine learning and deep learning models - and a common file -format to enable AI developers to use models with a variety of -frameworks, tools, runtimes, and compilers. OpenVINO supports reading -models in ONNX format directly,that means they can be used with OpenVINO -Runtime without any prior conversion. - -Reading and loading an ONNX model, which is a single ``.onnx`` file, -works the same way as with an OpenVINO IR model. The ``model`` argument -points to the filename of an ONNX model. - -.. code:: ipython3 - - onnx_model_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/002-example-models/segmentation.onnx" - onnx_model_name = "segmentation.onnx" - - if not Path("model/" + onnx_model_name).exists(): - download_file(onnx_model_url, filename=onnx_model_name, directory="model") - - -.. parsed-literal:: - - 'model/segmentation.onnx' already exists. - - - - -.. 
parsed-literal:: - - PosixPath('/home/ea/work/openvino_notebooks/notebooks/openvino-api/model/segmentation.onnx') - - - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - onnx_model_path = "model/segmentation.onnx" - - model_onnx = core.read_model(model=onnx_model_path) - compiled_model_onnx = core.compile_model(model=model_onnx, device_name=device.value) - -The ONNX model can be exported to OpenVINO IR with ``save_model()``: - -.. code:: ipython3 - - ov.save_model(model_onnx, output_model="model/exported_onnx_model.xml") - -PaddlePaddle Model -~~~~~~~~~~~~~~~~~~ - - - -`PaddlePaddle `__ -models saved for inference can also be passed to OpenVINO Runtime -without any conversion step. Pass the filename with extension to -``read_model`` and exported an OpenVINO IR with ``save_model`` - -.. code:: ipython3 - - paddle_model_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/002-example-models/" - paddle_model_name = "inference.pdmodel" - paddle_params_name = "inference.pdiparams" - - if not Path("model/" + paddle_model_name).exists() or not Path("model/" + paddle_params_name).exists(): - download_file(paddle_model_url + paddle_model_name, filename=paddle_model_name, directory="model") - download_file( - paddle_model_url + paddle_params_name, - filename=paddle_params_name, - directory="model", - ) - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - paddle_model_path = "model/inference.pdmodel" - - model_paddle = core.read_model(model=paddle_model_path) - compiled_model_paddle = core.compile_model(model=model_paddle, device_name=device.value) - -.. code:: ipython3 - - ov.save_model(model_paddle, output_model="model/exported_paddle_model.xml") - -TensorFlow Model -~~~~~~~~~~~~~~~~ - - - -TensorFlow models saved in frozen graph format can also be passed to -``read_model``. - -.. code:: ipython3 - - pb_model_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/002-example-models/classification.pb" - pb_model_name = "classification.pb" - - if not Path("model/" + pb_model_name).exists(): - download_file(pb_model_url, filename=pb_model_name, directory="model") - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - tf_model_path = "model/classification.pb" - - model_tf = core.read_model(model=tf_model_path) - compiled_model_tf = core.compile_model(model=model_tf, device_name=device.value) - -.. code:: ipython3 - - ov.save_model(model_tf, output_model="model/exported_tf_model.xml") - -TensorFlow Lite Model -~~~~~~~~~~~~~~~~~~~~~ - - - -`TFLite `__ models saved for inference -can also be passed to OpenVINO Runtime. Pass the filename with extension -``.tflite`` to ``read_model`` and exported an OpenVINO IR with -``save_model``. - -This tutorial uses the image classification model -`inception_v4_quant `__. -It is pre-trained model optimized to work with TensorFlow Lite. - -.. code:: ipython3 - - %pip install -q kagglehub - -.. code:: ipython3 - - from pathlib import Path - import kagglehub - - tflite_model_dir = kagglehub.model_download("tensorflow/inception/tfLite/v4-quant") - tflite_model_path = Path(tflite_model_dir) / "1.tflite" - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - - model_tflite = core.read_model(tflite_model_path) - compiled_model_tflite = core.compile_model(model=model_tflite, device_name=device.value) - -.. 
code:: ipython3 - - ov.save_model(model_tflite, output_model="model/exported_tflite_model.xml") - -PyTorch Model -~~~~~~~~~~~~~ - - - -`PyTorch `__ models can not be directly passed to -``core.read_model``. ``ov.Model`` for model objects from this framework -can be obtained using ``ov.convert_model`` API. You can find more -details in `pytorch-to-openvino `__ notebook. In -this tutorial we will use -`resnet18 `__ -model form torchvision library. After conversion model using -``ov.convert_model``, it can be compiled on device using -``core.compile_model`` or saved on disk for the next usage using -``ov.save_model`` - -.. code:: ipython3 - - %pip install -q "torch>=2.1" torchvision --extra-index-url https://download.pytorch.org/whl/cpu - -.. code:: ipython3 - - import openvino as ov - import torch - from torchvision.models import resnet18, ResNet18_Weights - - core = ov.Core() - - pt_model = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1) - example_input = torch.zeros((1, 3, 224, 224)) - ov_model_pytorch = ov.convert_model(pt_model, example_input=example_input) - - compiled_model_pytorch = core.compile_model(ov_model_pytorch, device_name=device.value) - - ov.save_model(ov_model_pytorch, "model/exported_pytorch_model.xml") - -Getting Information about a Model ---------------------------------- - - - -The OpenVINO Model instance stores information about the model. -Information about the inputs and outputs of the model are in -``model.inputs`` and ``model.outputs``. These are also properties of the -``CompiledModel`` instance. While using ``model.inputs`` and -``model.outputs`` in the cells below, you can also use -``compiled_model.inputs`` and ``compiled_model.outputs``. - -.. code:: ipython3 - - ir_model_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/002-example-models/" - ir_model_name_xml = "classification.xml" - ir_model_name_bin = "classification.bin" - - if not Path("model/" + ir_model_name_xml).exists(): - download_file(ir_model_url + ir_model_name_xml, filename=ir_model_name_xml, directory="model") - download_file(ir_model_url + ir_model_name_bin, filename=ir_model_name_bin, directory="model") - - -.. parsed-literal:: - - 'model/classification.xml' already exists. - 'model/classification.bin' already exists. - - - - -.. parsed-literal:: - - PosixPath('/home/ea/work/openvino_notebooks/notebooks/openvino-api/model/classification.bin') - - - -Model Inputs -~~~~~~~~~~~~ - - - -Information about all input layers is stored in the ``inputs`` -dictionary. - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - classification_model_xml = "model/classification.xml" - model = core.read_model(model=classification_model_xml) - model.inputs - - - - -.. parsed-literal:: - - [] - - - -The cell above shows that the loaded model expects one input with the -name *input*. If you loaded a different model, you may see a different -input layer name, and you may see more inputs. You may also obtain info -about each input layer using ``model.input(index)``, where index is a -numeric index of the input layers in the model. If a model has only one -input, index can be omitted. - -.. code:: ipython3 - - input_layer = model.input(0) - -It is often useful to have a reference to the name of the first input -layer. For a model with one input, ``model.input(0).any_name`` gets this -name. - -.. code:: ipython3 - - input_layer.any_name - - - - -.. parsed-literal:: - - 'input' - - - -The next cell prints the input layout, precision and shape. - -.. 
code:: ipython3 - - print(f"input precision: {input_layer.element_type}") - print(f"input shape: {input_layer.shape}") - - -.. parsed-literal:: - - input precision: - input shape: [1,3,224,224] - - -This cell shows that the model expects inputs with a shape of -[1,3,224,224], and that this is in the ``NCHW`` layout. This means that -the model expects input data with the batch size of 1 (``N``), 3 -channels (``C``) , and images with a height (``H``) and width (``W``) -equal to 224. The input data is expected to be of ``FP32`` (floating -point) precision. - -Model Outputs -~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - classification_model_xml = "model/classification.xml" - model = core.read_model(model=classification_model_xml) - model.outputs - - - - -.. parsed-literal:: - - [] - - - -Model output info is stored in ``model.outputs``. The cell above shows -that the model returns one output, with the -``MobilenetV3/Predictions/Softmax`` name. Loading a different model will -result in different output layer name, and more outputs might be -returned. Similar to input, you may also obtain information about each -output separately using ``model.output(index)`` - -Since this model has one output, follow the same method as for the input -layer to get its name. - -.. code:: ipython3 - - output_layer = model.output(0) - output_layer.any_name - - - - -.. parsed-literal:: - - 'MobilenetV3/Predictions/Softmax' - - - -Getting the output precision and shape is similar to getting the input -precision and shape. - -.. code:: ipython3 - - print(f"output precision: {output_layer.element_type}") - print(f"output shape: {output_layer.shape}") - - -.. parsed-literal:: - - output precision: - output shape: [1,1001] - - -This cell shows that the model returns outputs with a shape of [1, -1001], where 1 is the batch size (``N``) and 1001 is the number of -classes (``C``). The output is returned as 32-bit floating point. - -Doing Inference on a Model --------------------------- - - - - **NOTE** this notebook demonstrates only the basic synchronous - inference API. For an async inference example, please refer to `Async - API notebook `__ - -The diagram below shows a typical inference pipeline with OpenVINO - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/a91bc582-165b-41a2-ab08-12c812059936 - :alt: image.png - - image.png - -Creating OpenVINO Core and model compilation is covered in the previous -steps. The next step is preparing inputs. You can provide inputs in one -of the supported format: dictionary with name of inputs as keys and -``np.arrays`` that represent input tensors as values, list or tuple of -``np.arrays`` represented input tensors (their order should match with -model inputs order). If a model has a single input, wrapping to a -dictionary or list can be omitted. To do inference on a model, pass -prepared inputs into compiled model object obtained using -``core.compile_model``. The inference result represented as dictionary, -where keys are model outputs and ``np.arrays`` represented their -produced data as values. - -.. code:: ipython3 - - # Install opencv package for image handling - %pip install -q opencv-python - -**Load the network** - -.. 
code:: ipython3 - - ir_model_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/002-example-models/" - ir_model_name_xml = "classification.xml" - ir_model_name_bin = "classification.bin" - - if not Path("model/" + ir_model_name_xml).exists(): - download_file(ir_model_url + ir_model_name_xml, filename=ir_model_name_xml, directory="model") - download_file(ir_model_url + ir_model_name_bin, filename=ir_model_name_bin, directory="model") - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - classification_model_xml = "model/classification.xml" - model = core.read_model(model=classification_model_xml) - compiled_model = core.compile_model(model=model, device_name=device.value) - input_layer = compiled_model.input(0) - output_layer = compiled_model.output(0) - -**Load an image and convert to the input shape** - -To propagate an image through the network, it needs to be loaded into an -array, resized to the shape that the network expects, and converted to -the input layout of the network. - -.. code:: ipython3 - - import cv2 - - image_filename = Path("data/coco_hollywood.jpg") - if not image_filename.exists(): - image_filename = download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_hollywood.jpg", - directory="data", - ) - image = cv2.imread(str(image_filename)) - image.shape - - -.. parsed-literal:: - - 'data/coco_hollywood.jpg' already exists. - - - - -.. parsed-literal:: - - (663, 994, 3) - - - -The image has a shape of (663,994,3). It is 663 pixels in height, 994 -pixels in width, and has 3 color channels. A reference to the height and -width expected by the network is obtained and the image is resized to -these dimensions. - -.. code:: ipython3 - - # N,C,H,W = batch size, number of channels, height, width. - N, C, H, W = input_layer.shape - # OpenCV resize expects the destination size as (width, height). - resized_image = cv2.resize(src=image, dsize=(W, H)) - resized_image.shape - - - - -.. parsed-literal:: - - (224, 224, 3) - - - -Now, the image has the width and height that the network expects. This -is still in ``HWC`` format and must be changed to ``NCHW`` format. -First, call the ``np.transpose()`` method to change to ``CHW`` and then -add the ``N`` dimension (where ``N``\ = 1) by calling the -``np.expand_dims()`` method. Next, convert the data to ``FP32`` with -``np.astype()`` method. - -.. code:: ipython3 - - import numpy as np - - input_data = np.expand_dims(np.transpose(resized_image, (2, 0, 1)), 0).astype(np.float32) - input_data.shape - - - - -.. parsed-literal:: - - (1, 3, 224, 224) - - - -**Do inference** - -Now that the input data is in the right shape, run inference. The -``CompiledModel`` inference result is a dictionary where keys are the -Output class instances (the same keys in ``compiled_model.outputs`` that -can also be obtained with ``compiled_model.output(index)``) and values - -predicted result in ``np.array`` format. - -.. code:: ipython3 - - # for single input models only - result = compiled_model(input_data)[output_layer] - - # for multiple inputs in a list - result = compiled_model([input_data])[output_layer] - - # or using a dictionary, where the key is input tensor name or index - result = compiled_model({input_layer.any_name: input_data})[output_layer] - -You can also create ``InferRequest`` and run ``infer`` method on -request. - -.. 
code:: ipython3 - - request = compiled_model.create_infer_request() - request.infer(inputs={input_layer.any_name: input_data}) - result = request.get_output_tensor(output_layer.index).data - -The ``.infer()`` function sets output tensor, that can be reached, using -``get_output_tensor()``. Since this network returns one output, and the -reference to the output layer is in the ``output_layer.index`` -parameter, you can get the data with -``request.get_output_tensor(output_layer.index)``. To get a numpy array -from the output, use the ``.data`` parameter. - -.. code:: ipython3 - - result.shape - - - - -.. parsed-literal:: - - (1, 1001) - - - -The output shape is (1,1001), which is the expected output shape. This -shape indicates that the network returns probabilities for 1001 classes. -To learn more about this notion, refer to the `hello world -notebook `__. - -Reshaping and Resizing ----------------------- - - - -Change Image Size -~~~~~~~~~~~~~~~~~ - - - -Instead of reshaping the image to fit the model, it is also possible to -reshape the model to fit the image. Be aware that not all models support -reshaping, and models that do, may not support all input shapes. The -model accuracy may also suffer if you reshape the model input shape. - -First check the input shape of the model, then reshape it to the new -input shape. - -.. code:: ipython3 - - ir_model_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/002-example-models/" - ir_model_name_xml = "segmentation.xml" - ir_model_name_bin = "segmentation.bin" - - if not Path("model/" + ir_model_name_xml).exists(): - download_file(ir_model_url + ir_model_name_xml, filename=ir_model_name_xml, directory="model") - download_file(ir_model_url + ir_model_name_bin, filename=ir_model_name_bin, directory="model") - - -.. parsed-literal:: - - 'model/segmentation.xml' already exists. - 'model/segmentation.bin' already exists. - - - - -.. parsed-literal:: - - PosixPath('/home/ea/work/openvino_notebooks/notebooks/openvino-api/model/segmentation.bin') - - - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - segmentation_model_xml = "model/segmentation.xml" - segmentation_model = core.read_model(model=segmentation_model_xml) - segmentation_input_layer = segmentation_model.input(0) - segmentation_output_layer = segmentation_model.output(0) - - print("~~~~ ORIGINAL MODEL ~~~~") - print(f"input shape: {segmentation_input_layer.shape}") - print(f"output shape: {segmentation_output_layer.shape}") - - new_shape = ov.PartialShape([1, 3, 544, 544]) - segmentation_model.reshape({segmentation_input_layer.any_name: new_shape}) - segmentation_compiled_model = core.compile_model(model=segmentation_model, device_name=device.value) - # help(segmentation_compiled_model) - print("~~~~ RESHAPED MODEL ~~~~") - print(f"model input shape: {segmentation_input_layer.shape}") - print(f"compiled_model input shape: " f"{segmentation_compiled_model.input(index=0).shape}") - print(f"compiled_model output shape: {segmentation_output_layer.shape}") - - -.. parsed-literal:: - - ~~~~ ORIGINAL MODEL ~~~~ - input shape: [1,3,512,512] - output shape: [1,1,512,512] - ~~~~ RESHAPED MODEL ~~~~ - model input shape: [1,3,544,544] - compiled_model input shape: [1,3,544,544] - compiled_model output shape: [1,1,544,544] - - -The input shape for the segmentation network is [1,3,512,512], with the -``NCHW`` layout: the network expects 3-channel images with a width and -height of 512 and a batch size of 1. 
Reshape the network with the -``.reshape()`` method of ``IENetwork`` to make it accept input images -with a width and height of 544. This segmentation network always returns -arrays with the input width and height of equal value. Therefore, -setting the input dimensions to 544x544 also modifies the output -dimensions. After reshaping, compile the network once again. - -Change Batch Size -~~~~~~~~~~~~~~~~~ - - - -Use the ``.reshape()`` method to set the batch size, by increasing the -first element of ``new_shape``. For example, to set a batch size of two, -set ``new_shape = (2,3,544,544)`` in the cell above. - -.. code:: ipython3 - - import openvino as ov - - segmentation_model_xml = "model/segmentation.xml" - segmentation_model = core.read_model(model=segmentation_model_xml) - segmentation_input_layer = segmentation_model.input(0) - segmentation_output_layer = segmentation_model.output(0) - new_shape = ov.PartialShape([2, 3, 544, 544]) - segmentation_model.reshape({segmentation_input_layer.any_name: new_shape}) - segmentation_compiled_model = core.compile_model(model=segmentation_model, device_name=device.value) - - print(f"input shape: {segmentation_input_layer.shape}") - print(f"output shape: {segmentation_output_layer.shape}") - - -.. parsed-literal:: - - input shape: [2,3,544,544] - output shape: [2,1,544,544] - - -The output shows that by setting the batch size to 2, the first element -(``N``) of the input and output shape has a value of 2. Propagate the -input image through the network to see the result: - -.. code:: ipython3 - - import numpy as np - import openvino as ov - - core = ov.Core() - segmentation_model_xml = "model/segmentation.xml" - segmentation_model = core.read_model(model=segmentation_model_xml) - segmentation_input_layer = segmentation_model.input(0) - segmentation_output_layer = segmentation_model.output(0) - new_shape = ov.PartialShape([2, 3, 544, 544]) - segmentation_model.reshape({segmentation_input_layer.any_name: new_shape}) - segmentation_compiled_model = core.compile_model(model=segmentation_model, device_name=device.value) - input_data = np.random.rand(2, 3, 544, 544) - - output = segmentation_compiled_model([input_data]) - - print(f"input data shape: {input_data.shape}") - print(f"result data data shape: {segmentation_output_layer.shape}") - - -.. parsed-literal:: - - input data shape: (2, 3, 544, 544) - result data data shape: [2,1,544,544] - - -Caching a Model ---------------- - - - -For some devices, like GPU, loading a model can take some time. Model -Caching solves this issue by caching the model in a cache directory. If -``core.compile_model(model=net, device_name=device_name, config=config_dict)`` -is set, caching will be used. This option checks if a model exists in -the cache. If so, it loads it from the cache. If not, it loads the model -regularly, and stores it in the cache, so that the next time the model -is loaded when this option is set, the model will be loaded from the -cache. - -In the cell below, we create a *model_cache* directory as a subdirectory -of *model*, where the model will be cached for the specified device. The -model will be loaded to the GPU. After running this cell once, the model -will be cached, so subsequent runs of this cell will load the model from -the cache. - -*Note: Model Caching is also available on CPU devices* - -.. 
code:: ipython3 - - ir_model_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/002-example-models/" - ir_model_name_xml = "classification.xml" - ir_model_name_bin = "classification.bin" - - if not Path("model/" + ir_model_name_xml).exists(): - download_file(ir_model_url + ir_model_name_xml, filename=ir_model_name_xml, directory="model") - download_file(ir_model_url + ir_model_name_bin, filename=ir_model_name_bin, directory="model") - - -.. parsed-literal:: - - 'model/classification.xml' already exists. - 'model/classification.bin' already exists. - - - - -.. parsed-literal:: - - PosixPath('/home/ea/work/openvino_notebooks/notebooks/openvino-api/model/classification.bin') - - - -.. code:: ipython3 - - import time - from pathlib import Path - - import openvino as ov - - core = ov.Core() - - cache_path = Path("model/model_cache") - cache_path.mkdir(exist_ok=True) - # Enable caching for OpenVINO Runtime. To disable caching set enable_caching = False - enable_caching = True - config_dict = {"CACHE_DIR": str(cache_path)} if enable_caching else {} - - classification_model_xml = "model/classification.xml" - model = core.read_model(model=classification_model_xml) - - start_time = time.perf_counter() - compiled_model = core.compile_model(model=model, device_name=device.value, config=config_dict) - end_time = time.perf_counter() - print(f"Loading the network to the {device.value} device took {end_time-start_time:.2f} seconds.") - - -.. parsed-literal:: - - Loading the network to the CPU device took 0.14 seconds. - - -After running the previous cell, we know the model exists in the cache -directory. Then, we delete the compiled model and load it again. Now, we -measure the time it takes now. - -.. code:: ipython3 - - del compiled_model - start_time = time.perf_counter() - compiled_model = core.compile_model(model=model, device_name=device.value, config=config_dict) - end_time = time.perf_counter() - print(f"Loading the network to the {device.value} device took {end_time-start_time:.2f} seconds.") - - -.. parsed-literal:: - - Loading the network to the CPU device took 0.06 seconds. - diff --git a/docs/notebooks/openvino-tokenizers-with-output.rst b/docs/notebooks/openvino-tokenizers-with-output.rst deleted file mode 100644 index 6d8076ceb75d32..00000000000000 --- a/docs/notebooks/openvino-tokenizers-with-output.rst +++ /dev/null @@ -1,570 +0,0 @@ -OpenVINO Tokenizers: Incorporate Text Processing Into OpenVINO Pipelines -======================================================================== - -.. raw:: html - -

- -OpenVINO Tokenizers is an OpenVINO extension and a Python library -designed to streamline tokenizer conversion for seamless integration -into your projects. It supports Python and C++ environments and is -compatible with all major platforms: Linux, Windows, and MacOS. - - -**Table of contents:** - - -- `Tokenization Basics <#tokenization-basics>`__ -- `Acquiring OpenVINO Tokenizers <#acquiring-openvino-tokenizers>`__ - - - `Convert Tokenizer from HuggingFace Hub with CLI - Tool <#convert-tokenizer-from_huggingface-hub-with-cli-tool>`__ - - `Convert Tokenizer from HuggingFace Hub with Python - API <#convert-tokenizer-from-huggingface-hub-with-python-api>`__ - -- `Text Generation Pipeline with OpenVINO - Tokenizers <#text-generation-pipeline-with-openvino-tokenizers>`__ -- `Text Generation Pipeline with OpenVINO GenAI and OpenVINO - Tokenizers <#text-generation-pipeline-with-openvino-genai-and-openvino-tokenizers>`__ -- `Merge Tokenizer into a Model <#merge-tokenizer-into-a-model>`__ -- `Conclusion <#conclusion>`__ -- `Links <#links>`__ - -Tokenization Basics -------------------- - - - -One does not simply put text into a neural network, only numbers. The -process of transforming text into a sequence of numbers is called -**tokenization**. It usually contains several steps that transform the -original string, splitting it into parts - tokens - with an associated -number in a dictionary. You can check the `interactive GPT-4 -tokenizer `__ to gain an -intuitive understanding of the principles of tokenizer work. - -.. raw:: html - -
There are two important points in the tokenizer-model relation:

1. Every neural network with text input is paired with a tokenizer and
   *cannot be used without it*.
2. To reproduce the model’s accuracy on a specific task, it is essential
   to *use the same tokenizer that was employed during model training*.

That is why almost all model repositories on the `HuggingFace
Hub <https://huggingface.co/models>`__ also contain tokenizer files
(``tokenizer.json``, ``vocab.txt``, ``merges.txt``, etc.).

The process of transforming a sequence of numbers back into a string is
called **detokenization**. A detokenizer can share the token dictionary
with the tokenizer, as most LLM chat models do, or operate with an
entirely distinct dictionary. For instance, translation models dealing
with different source and target languages often require separate
dictionaries.
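As a quick illustrative sketch (not part of the original notebook), a
HuggingFace tokenizer already exposes both directions of this mapping.
The snippet below assumes the ``transformers`` package installed later
in this tutorial and reuses the ``TinyLlama`` model id that appears
further down; any short input string works.

.. code:: ipython3

    from transformers import AutoTokenizer

    # Illustrative round-trip; the model id mirrors the one used later in this notebook.
    hf_tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T")

    token_ids = hf_tokenizer("OpenVINO Tokenizers").input_ids  # tokenization: text -> token ids
    print(token_ids)
    print(hf_tokenizer.decode(token_ids))  # detokenization: token ids -> text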
- -Some tasks only need a tokenizer, like text classification, named entity -recognition, question answering, and feature extraction. On the other -hand, for tasks such as text generation, chat, translation, and -abstractive summarization, both a tokenizer and a detokenizer are -required. - - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Acquiring OpenVINO Tokenizers ------------------------------ - - - -OpenVINO Tokenizers Python library allows you to convert HuggingFace -tokenizers into OpenVINO models. To install all required dependencies -use ``pip install openvino-tokenizers[transformers]``. - -.. code:: ipython3 - - %pip install -Uq pip - %pip install -q -U "openvino>=2024.3.0" openvino-tokenizers[transformers] openvino-genai - %pip install "numpy<2.0.0" "torch>=2.1" --extra-index-url https://download.pytorch.org/whl/cpu - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("openvino-tokenizers.ipynb") - -.. code:: ipython3 - - from pathlib import Path - - - tokenizer_dir = Path("tokenizer/") - model_id = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" - -Convert Tokenizer from HuggingFace Hub with CLI Tool -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The first way is to use the CLI utility, bundled with OpenVINO -Tokenizers. Use ``--with-detokenizer`` flag to add the detokenizer model -to the output. By setting ``--clean-up-tokenization-spaces=False`` we -ensure that the detokenizer correctly decodes a code-generation model -output. ``--trust-remote-code`` flag works the same way as passing -``trust_remote_code=True`` to ``AutoTokenizer.from_pretrained`` -constructor. - -.. code:: ipython3 - - if not tokenizer_dir.exists(): - !convert_tokenizer $model_id --with-detokenizer -o $tokenizer_dir - - -.. parsed-literal:: - - Loading Huggingface Tokenizer... - Converting Huggingface Tokenizer to OpenVINO... - Saved OpenVINO Tokenizer: tokenizer/openvino_tokenizer.xml, tokenizer/openvino_tokenizer.bin - Saved OpenVINO Detokenizer: tokenizer/openvino_detokenizer.xml, tokenizer/openvino_detokenizer.bin - - - ⚠️ If you have any problems with the command above on MacOS, try to - `install tbb `__. - -The result is two OpenVINO models: ``openvino_tokenizer`` and -``openvino_detokenizer``. Both can be interacted with using -``read_model``, ``compile_model`` and ``save_model``, similar to any -other OpenVINO model. - -Convert Tokenizer from HuggingFace Hub with Python API -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The other method is to pass HuggingFace ``hf_tokenizer`` object to -``convert_tokenizer`` function: - -.. code:: ipython3 - - from transformers import AutoTokenizer - from openvino_tokenizers import convert_tokenizer - - - hf_tokenizer = AutoTokenizer.from_pretrained(model_id) - ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True) - ov_tokenizer, ov_detokenizer - - - - -.. 
parsed-literal:: - - ( - ] - outputs[ - , - - ]>, - - ] - outputs[ - - ]>) - - - -That way you get OpenVINO model objects. Use ``save_model`` function -from OpenVINO to reuse converted tokenizers later: - - ⚠️ Import ``openvino_tokenizers`` will add all tokenizer-related - operations to OpenVINO, after which you can work with saved - tokenizers and detokenizers. - -.. code:: ipython3 - - import openvino as ov - - # This import is needed to add all tokenizer-related operations to OpenVINO - import openvino_tokenizers # noqa: F401 - - - ov.save_model(ov_tokenizer, tokenizer_dir / "openvino_tokenizer.xml") - ov.save_model(ov_detokenizer, tokenizer_dir / "openvino_detokenizer.xml") - -To use the tokenizer, compile the converted model and input a list of -strings. It’s essential to be aware that not all original tokenizers -support multiple strings (also called batches) as input. This limitation -arises from the requirement for all resulting number sequences to -maintain the same length. To address this, a padding token must be -specified, which will be appended to shorter tokenized strings. In cases -where no padding token is determined in the original tokenizer, OpenVINO -Tokenizers defaults to using :math:`0` for padding. Presently, *only -right-side padding is supported*, typically used for classification -tasks, but not suitable for text generation. - -.. code:: ipython3 - - tokenizer, detokenizer = ov.compile_model(ov_tokenizer), ov.compile_model(ov_detokenizer) - test_strings = ["Test", "strings"] - - token_ids = tokenizer(test_strings)["input_ids"] - print(f"Token ids: {token_ids}") - - detokenized_text = detokenizer(token_ids)["string_output"] - print(f"Detokenized text: {detokenized_text}") - - -.. parsed-literal:: - - Token ids: [[ 1 4321] - [ 1 6031]] - Detokenized text: ['Test' 'strings'] - - -We can compare the result of converted (de)tokenizer with the original -one: - -.. code:: ipython3 - - hf_token_ids = hf_tokenizer(test_strings).input_ids - print(f"Token ids: {hf_token_ids}") - - hf_detokenized_text = hf_tokenizer.batch_decode(hf_token_ids) - print(f"Detokenized text: {hf_detokenized_text}") - - -.. parsed-literal:: - - Token ids: [[1, 4321], [1, 6031]] - Detokenized text: [' Test', ' strings'] - - -Text Generation Pipeline with OpenVINO Tokenizers -------------------------------------------------- - - - -Let’s build a text generation pipeline with OpenVINO Tokenizers and -minimal dependencies. To obtain an OpenVINO model we will use the -Optimum library. The latest version allows you to get a so-called -`stateful -model `__. - -The original ``TinyLlama-1.1B-intermediate-step-1431k-3T`` model is -4.4Gb. To reduce network and disk usage we will load a converted model -which has also been compressed to ``int8``. The original conversion -command is commented. - -.. code:: ipython3 - - model_dir = Path(Path(model_id).name) - - if not model_dir.exists(): - # Converting the original model - # %pip install -U "git+https://github.com/huggingface/optimum-intel.git" "nncf>=2.8.0" onnx - # %optimum-cli export openvino -m $model_id --task text-generation-with-past $model_dir - - # Load already converted model - from huggingface_hub import hf_hub_download - - hf_hub_download( - "chgk13/TinyLlama-1.1B-intermediate-step-1431k-3T", - filename="openvino_model.xml", - local_dir=model_dir, - ) - hf_hub_download( - "chgk13/TinyLlama-1.1B-intermediate-step-1431k-3T", - filename="openvino_model.bin", - local_dir=model_dir, - ) - -.. 
code:: ipython3 - - import numpy as np - from tqdm.notebook import trange - from pathlib import Path - from openvino_tokenizers.constants import EOS_TOKEN_ID_NAME - - - core = ov.Core() - - ov_model = core.read_model(model_dir / "openvino_model.xml") - compiled_model = core.compile_model(ov_model) - infer_request = compiled_model.create_infer_request() - -The ``infer_request`` object provides control over the model’s state - a -Key-Value cache that speeds up inference by reducing computations. -Multiple inference requests can be created, and each request maintains a -distinct and separate state. - -.. code:: ipython3 - - text_input = ["Quick brown fox jumped"] - - model_input = {name.any_name: output for name, output in tokenizer(text_input).items()} - - if "position_ids" in (input.any_name for input in infer_request.model_inputs): - model_input["position_ids"] = np.arange(model_input["input_ids"].shape[1], dtype=np.int64)[np.newaxis, :] - - # No beam search, set idx to 0 - model_input["beam_idx"] = np.array([0], dtype=np.int32) - - # End of sentence token is that model signifies the end of text generation - # Read EOS token ID from rt_info of tokenizer/detokenizer ov.Model object - eos_token = ov_tokenizer.get_rt_info(EOS_TOKEN_ID_NAME).value - - tokens_result = np.array([[]], dtype=np.int64) - - # Reset KV cache inside the model before inference - infer_request.reset_state() - max_infer = 5 - - for _ in trange(max_infer): - infer_request.start_async(model_input) - infer_request.wait() - - output_tensor = infer_request.get_output_tensor() - - # Get the most probable token - token_indices = np.argmax(output_tensor.data, axis=-1) - output_token = token_indices[:, -1:] - - # Concatenate previous tokens result with newly generated token - tokens_result = np.hstack((tokens_result, output_token)) - if output_token[0, 0] == eos_token: - break - - # Prepare input for the next inference iteration - model_input["input_ids"] = output_token - model_input["attention_mask"] = np.hstack((model_input["attention_mask"].data, [[1]])) - model_input["position_ids"] = np.hstack( - ( - model_input["position_ids"].data, - [[model_input["position_ids"].data.shape[-1]]], - ) - ) - - - text_result = detokenizer(tokens_result)["string_output"] - print(f"Prompt:\n{text_input[0]}") - print(f"Generated:\n{text_result[0]}") - - - -.. parsed-literal:: - - 0%| | 0/5 [00:00`__ -is a flavor of OpenVINO, aiming to simplify running inference of -generative AI models. It hides the complexity of the generation process -and minimizes the amount of code required. OpenVINO GenAI depends on -`OpenVINO `__ and `OpenVINO -Tokenizers `__. - -Firstly we need to create a pipeline with ``LLMPipeline``. -``LLMPipeline`` is the main object used for text generation using LLM in -OpenVINO GenAI API. You can construct it straight away from the folder -where both converted model and tokenizer are located, -e.g. ``ov_genai.LLMPipeline(model_and_tokenizer_path)``. - -As the model and tokenizer are located in different directories, we -create a ``ov_genai.Tokenizer`` object by providing the path to saved -tokenizer. Then we will provide directory with model, tokenizer object -and device for ``LLMPipeline``. Lastly we run ``generate`` method and -get the output in text format. - -Additionally, we can configure parameters for decoding. We can get the -default config with ``get_generation_config()``, setup parameters, and -apply the updated version with ``set_generation_config(config)`` or put -config directly to ``generate()``. 
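As a rough sketch of the config-object route (not from the original
notebook): it assumes the same ``model_dir`` and ``tokenizer_dir``
prepared above, and that ``max_new_tokens`` is exposed as a field of the
returned generation config object.

.. code:: ipython3

    import openvino_genai as ov_genai

    # Hypothetical illustration of get_generation_config() / set_generation_config(),
    # using the model and tokenizer directories prepared in the previous steps.
    sketch_pipe = ov_genai.LLMPipeline(str(model_dir), ov_genai.Tokenizer(str(tokenizer_dir)), "CPU")

    config = sketch_pipe.get_generation_config()  # default decoding parameters
    config.max_new_tokens = 16                    # assumed field; stop after 16 new tokens
    sketch_pipe.set_generation_config(config)

    print(sketch_pipe.generate("Quick brown fox jumped"))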
It’s also possible to specify the -needed options just as inputs in the ``generate()`` method, as shown -below, e.g. we can add ``max_new_tokens`` to stop generation if a -specified number of tokens is generated and the end of generation is not -reached. - -Let’s build the same text generation pipeline, but with simplified -Python `OpenVINO Generate -API `__. -We will use the same model and tokenizer downloaded in previous steps. - -.. code:: ipython3 - - import openvino_genai as ov_genai - - genai_tokenizer = ov_genai.Tokenizer(str(tokenizer_dir)) - pipe = ov_genai.LLMPipeline(str(model_dir), genai_tokenizer, "CPU") - - result = pipe.generate(text_input[0], max_new_tokens=max_infer) - - print(f"Prompt:\n{text_input[0]}") - print(f"Generated:\n{result}") - - -.. parsed-literal:: - - Prompt: - Quick brown fox jumped - Generated: - over the lazy dog. - - -Merge Tokenizer into a Model ----------------------------- - - - -Packages like ``tensorflow-text`` offer the convenience of integrating -text processing directly into the model, streamlining both distribution -and usage. Similarly, with OpenVINO Tokenizers, you can create models -that combine a converted tokenizer and a model. It’s important to note -that not all scenarios benefit from this merge. In cases where a -tokenizer is used once and a model is inferred multiple times, as seen -in the earlier text generation example, maintaining a separate -(de)tokenizer and model is advisable to prevent unnecessary -tokenization-detokenization cycles during inference. Conversely, if both -a tokenizer and a model are used once in each pipeline inference, -merging simplifies the workflow and aids in avoiding the creation of -intermediate objects: - -.. raw:: html - -
- -The OpenVINO Python API allows you to avoid this by using the -``share_inputs`` option during inference, but it requires additional -input from a developer every time the model is inferred. Combining the -models and tokenizers simplifies memory management. Moreover, after the -combining models inputs have changed - original model has three inputs -(``input_ids``, ``attention_mask``, ``token_type_ids``) and combined -model has only one input for text input prompt. - -.. code:: ipython3 - - model_id = "mrm8488/bert-tiny-finetuned-sms-spam-detection" - model_dir = Path(Path(model_id).name) - - if not model_dir.exists(): - %pip install -qU git+https://github.com/huggingface/optimum-intel.git - !optimum-cli export openvino --model $model_id --task text-classification $model_dir - -.. code:: ipython3 - - from openvino_tokenizers import connect_models - - - core = ov.Core() - text_input = ["Free money!!!"] - - ov_tokenizer = core.read_model(model_dir / "openvino_tokenizer.xml") - ov_model = core.read_model(model_dir / "openvino_model.xml") - combined_model = connect_models(ov_tokenizer, ov_model) - ov.save_model(combined_model, model_dir / "combined_openvino_model.xml") - - print("Original OpenVINO model inputs:") - for input in ov_model.inputs: - print(input) - - print("\nCombined OpenVINO model inputs:") - for input in combined_model.inputs: - print(input) - - compiled_combined_model = core.compile_model(combined_model) - openvino_output = compiled_combined_model(text_input) - - print(f"\nLogits: {openvino_output['logits']}") - - -.. parsed-literal:: - - Original OpenVINO model inputs: - - - - - Combined OpenVINO model inputs: - - - Logits: [[ 1.2007061 -1.469803 ]] - - -Conclusion ----------- - - - -The OpenVINO Tokenizers integrate text processing operations into the -OpenVINO ecosystem. Enabling the conversion of HuggingFace tokenizers -into OpenVINO models, the library allows efficient deployment of deep -learning pipelines across varied environments. The feature of combining -tokenizers and models not only simplifies memory management but also -helps to streamline model usage and deployment. - -Links ------ - - - -- `Installation instructions for different - environments `__ -- `Supported Tokenizer - Types `__ -- `OpenVINO.GenAI repository with the C++ example of OpenVINO - Tokenizers - usage `__ -- `HuggingFace Tokenizers Comparison - Table `__ diff --git a/docs/notebooks/openvoice-with-output.rst b/docs/notebooks/openvoice-with-output.rst deleted file mode 100644 index 2fe77cb30c511f..00000000000000 --- a/docs/notebooks/openvoice-with-output.rst +++ /dev/null @@ -1,1067 +0,0 @@ -Voice tone cloning with OpenVoice and OpenVINO -============================================== - -OpenVoice is a versatile instant voice tone transferring and generating -speech in various languages with just a brief audio snippet from the -source speaker. OpenVoice has three main features: (i) high quality tone -color replication with multiple languages and accents; (ii) it provides -fine-tuned control over voice styles, including emotions, accents, as -well as other parameters such as rhythm, pauses, and intonation. (iii) -OpenVoice achieves zero-shot cross-lingual voice cloning, eliminating -the need for the generated speech and the reference speech to be part of -a massive-speaker multilingual training dataset. - -.. 
figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/5703039/ca7eab80-148d-45b0-84e8-a5a279846b51 - :alt: image - - image - -More details about model can be found in `project web -page `__, -`paper `__, and official -`repository `__ - -This notebook provides example of converting `PyTorch OpenVoice -model `__ to OpenVINO IR. In -this tutorial we will explore how to convert and run OpenVoice using -OpenVINO. - - -**Table of contents:** - - -- `Clone repository and install - requirements <#clone-repository-and-install-requirements>`__ -- `Download checkpoints and load PyTorch - model <#download-checkpoints-and-load-pytorch-model>`__ -- `Convert models to OpenVINO IR <#convert-models-to-openvino-ir>`__ -- `Inference <#inference>`__ - - - `Select inference device <#select-inference-device>`__ - - `Select reference tone <#select-reference-tone>`__ - - `Run inference <#run-inference>`__ - -- `Run OpenVoice Gradio interactive - demo <#run-openvoice-gradio-interactive-demo>`__ -- `Cleanup <#cleanup>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Clone repository and install requirements ------------------------------------------ - - - -.. code:: ipython3 - - # Fetch `notebook_utils` module - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - if not Path("pip_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py", - ) - open("pip_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("openvoice.ipynb") - -.. 
code:: ipython3 - - from pathlib import Path - - from cmd_helper import clone_repo - from pip_helper import pip_install - import platform - - - repo_dir = Path("OpenVoice") - - if not repo_dir.exists(): - clone_repo("https://github.com/myshell-ai/OpenVoice") - orig_english_path = Path("OpenVoice/openvoice/text/_orig_english.py") - english_path = Path("OpenVoice/openvoice/text/english.py") - - english_path.rename(orig_english_path) - - with orig_english_path.open("r") as f: - data = f.read() - data = data.replace("unidecode", "anyascii") - with english_path.open("w") as out_f: - out_f.write(data) - - - # fix a problem with silero downloading and installing - with Path("OpenVoice/openvoice/se_extractor.py").open("r") as orig_file: - data = orig_file.read() - data = data.replace('method="silero"', 'method="silero:3.0"') - with Path("OpenVoice/openvoice/se_extractor.py").open("w") as out_f: - out_f.write(data) - - - pip_install("librosa>=0.8.1", "pydub>=0.25.1", "tqdm", "inflect>=7.0.0", "pypinyin>=0.50.0", "openvino>=2023.3", "gradio>=4.15") - pip_install( - "--extra-index-url", - "https://download.pytorch.org/whl/cpu", - "wavmark>=0.0.3", - "faster-whisper>=0.9.0", - "eng_to_ipa>=0.0.2", - "cn2an>=0.5.22", - "jieba>=0.42.1", - "langid>=1.1.6", - "ipywebrtc", - "anyascii", - "torch>=2.1", - "nncf>=2.11.0", - "dtw-python", - "more-itertools", - "tiktoken", - ) - pip_install("--no-deps", "whisper-timestamped>=1.14.2", "openai-whisper") - - if platform.system() == "Darwin": - pip_install("numpy<2.0") - -Download checkpoints and load PyTorch model -------------------------------------------- - - - -.. code:: ipython3 - - import os - import torch - import openvino as ov - import ipywidgets as widgets - from IPython.display import Audio - from notebook_utils import download_file, device_widget - - core = ov.Core() - - from openvoice.api import BaseSpeakerTTS, ToneColorConverter, OpenVoiceBaseClass - import openvoice.se_extractor as se_extractor - - -.. parsed-literal:: - - Importing the dtw module. When using in academic works please cite: - T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package. - J. Stat. Soft., doi:10.18637/jss.v031.i07. - - - -.. code:: ipython3 - - CKPT_BASE_PATH = Path("checkpoints") - - en_suffix = CKPT_BASE_PATH / "base_speakers/EN" - zh_suffix = CKPT_BASE_PATH / "base_speakers/ZH" - converter_suffix = CKPT_BASE_PATH / "converter" - -To make notebook lightweight by default model for Chinese speech is not -activated, in order turn on please set flag ``enable_chinese_lang`` to -True - -.. code:: ipython3 - - enable_chinese_lang = False - -.. code:: ipython3 - - def download_from_hf_hub(filename, local_dir="./"): - from huggingface_hub import hf_hub_download - - local_path = Path(local_dir) - local_path.mkdir(exist_ok=True) - hf_hub_download(repo_id="myshell-ai/OpenVoice", filename=filename, local_dir=local_path) - - - download_from_hf_hub(f"{converter_suffix.as_posix()}/checkpoint.pth") - download_from_hf_hub(f"{converter_suffix.as_posix()}/config.json") - download_from_hf_hub(f"{en_suffix.as_posix()}/checkpoint.pth") - download_from_hf_hub(f"{en_suffix.as_posix()}/config.json") - - download_from_hf_hub(f"{en_suffix.as_posix()}/en_default_se.pth") - download_from_hf_hub(f"{en_suffix.as_posix()}/en_style_se.pth") - - if enable_chinese_lang: - download_from_hf_hub(f"{zh_suffix.as_posix()}/checkpoint.pth") - download_from_hf_hub(f"{zh_suffix.as_posix()}/config.json") - download_from_hf_hub(f"{zh_suffix.as_posix()}/zh_default_se.pth") - -.. 
code:: ipython3 - - pt_device = "cpu" - - en_base_speaker_tts = BaseSpeakerTTS(en_suffix / "config.json", device=pt_device) - en_base_speaker_tts.load_ckpt(en_suffix / "checkpoint.pth") - - tone_color_converter = ToneColorConverter(converter_suffix / "config.json", device=pt_device) - tone_color_converter.load_ckpt(converter_suffix / "checkpoint.pth") - - if enable_chinese_lang: - zh_base_speaker_tts = BaseSpeakerTTS(zh_suffix / "config.json", device=pt_device) - zh_base_speaker_tts.load_ckpt(zh_suffix / "checkpoint.pth") - else: - zh_base_speaker_tts = None - - -.. parsed-literal:: - - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm. - warnings.warn("torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.") - - -.. parsed-literal:: - - Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth' - missing/unexpected keys: [] [] - Loaded checkpoint 'checkpoints/converter/checkpoint.pth' - missing/unexpected keys: [] [] - - -Convert models to OpenVINO IR ------------------------------ - - - -There are 2 models in OpenVoice: first one is responsible for speech -generation ``BaseSpeakerTTS`` and the second one ``ToneColorConverter`` -imposes arbitrary voice tone to the original speech. To convert to -OpenVino IR format first we need to get acceptable ``torch.nn.Module`` -object. Both ToneColorConverter, BaseSpeakerTTS instead of using -``self.forward`` as the main entry point use custom ``infer`` and -``convert_voice`` methods respectively, therefore need to wrap them with -a custom class that is inherited from torch.nn.Module. - -.. code:: ipython3 - - class OVOpenVoiceBase(torch.nn.Module): - """ - Base class for both TTS and voice tone conversion model: constructor is same for both of them. - """ - - def __init__(self, voice_model: OpenVoiceBaseClass): - super().__init__() - self.voice_model = voice_model - for par in voice_model.model.parameters(): - par.requires_grad = False - - - class OVOpenVoiceTTS(OVOpenVoiceBase): - """ - Constructor of this class accepts BaseSpeakerTTS object for speech generation and wraps it's 'infer' method with forward. - """ - - def get_example_input(self): - stn_tst = self.voice_model.get_text("this is original text", self.voice_model.hps, False) - x_tst = stn_tst.unsqueeze(0) - x_tst_lengths = torch.LongTensor([stn_tst.size(0)]) - speaker_id = torch.LongTensor([1]) - noise_scale = torch.tensor(0.667) - length_scale = torch.tensor(1.0) - noise_scale_w = torch.tensor(0.6) - return ( - x_tst, - x_tst_lengths, - speaker_id, - noise_scale, - length_scale, - noise_scale_w, - ) - - def forward(self, x, x_lengths, sid, noise_scale, length_scale, noise_scale_w): - return self.voice_model.model.infer(x, x_lengths, sid, noise_scale, length_scale, noise_scale_w) - - - class OVOpenVoiceConverter(OVOpenVoiceBase): - """ - Constructor of this class accepts ToneColorConverter object for voice tone conversion and wraps it's 'voice_conversion' method with forward. 
- """ - - def get_example_input(self): - y = torch.randn([1, 513, 238], dtype=torch.float32) - y_lengths = torch.LongTensor([y.size(-1)]) - target_se = torch.randn(*(1, 256, 1)) - source_se = torch.randn(*(1, 256, 1)) - tau = torch.tensor(0.3) - return (y, y_lengths, source_se, target_se, tau) - - def forward(self, y, y_lengths, sid_src, sid_tgt, tau): - return self.voice_model.model.voice_conversion(y, y_lengths, sid_src, sid_tgt, tau) - -Convert to OpenVino IR and save to IRs_path folder for the future use. -If IRs already exist skip conversion and read them directly - -For reducing memory consumption, weights compression optimization can be -applied using `NNCF `__. Weight -compression aims to reduce the memory footprint of a model. models, -which require extensive memory to store the weights during inference, -can benefit from weight compression in the following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method. The main -difference between weights compression and full model quantization -(post-training quantization) is that activations remain floating-point -in the case of weights compression which leads to a better accuracy. In -addition, weight compression is data-free and does not require a -calibration dataset, making it easy to use. - -``nncf.compress_weights`` function can be used for performing weights -compression. The function accepts an OpenVINO model and other -compression parameters. - -More details about weights compression can be found in `OpenVINO -documentation `__. - -.. code:: ipython3 - - import nncf - - - IRS_PATH = Path("openvino_irs/") - EN_TTS_IR = IRS_PATH / "openvoice_en_tts.xml" - ZH_TTS_IR = IRS_PATH / "openvoice_zh_tts.xml" - VOICE_CONVERTER_IR = IRS_PATH / "openvoice_tone_conversion.xml" - - paths = [EN_TTS_IR, VOICE_CONVERTER_IR] - models = [ - OVOpenVoiceTTS(en_base_speaker_tts), - OVOpenVoiceConverter(tone_color_converter), - ] - if enable_chinese_lang: - models.append(OVOpenVoiceTTS(zh_base_speaker_tts)) - paths.append(ZH_TTS_IR) - ov_models = [] - - for model, path in zip(models, paths): - if not path.exists(): - ov_model = ov.convert_model(model, example_input=model.get_example_input()) - ov_model = nncf.compress_weights(ov_model) - ov.save_model(ov_model, path) - else: - ov_model = core.read_model(path) - ov_models.append(ov_model) - - ov_en_tts, ov_voice_conversion = ov_models[:2] - if enable_chinese_lang: - ov_zh_tts = ov_models[-1] - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - this is original text. - length:22 - length:21 - - -.. parsed-literal:: - - /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/openvoice/OpenVoice/openvoice/attentions.py:283: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
- assert ( - /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/openvoice/OpenVoice/openvoice/attentions.py:346: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - pad_length = max(length - (self.window_size + 1), 0) - /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/openvoice/OpenVoice/openvoice/attentions.py:347: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - slice_start_position = max((self.window_size + 1) - length, 0) - /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/openvoice/OpenVoice/openvoice/attentions.py:349: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if pad_length > 0: - /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/openvoice/OpenVoice/openvoice/transforms.py:114: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if torch.min(inputs) < left or torch.max(inputs) > right: - /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/openvoice/OpenVoice/openvoice/transforms.py:119: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if min_bin_width * num_bins > 1.0: - /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/openvoice/OpenVoice/openvoice/transforms.py:121: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if min_bin_height * num_bins > 1.0: - /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/openvoice/OpenVoice/openvoice/transforms.py:171: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert (discriminant >= 0).all() - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/torch/jit/_trace.py:1116: TracerWarning: Trace had nondeterministic nodes. Did you forget call .eval() on your model? 
Nodes: - %3293 : Float(1, 2, 43, strides=[86, 43, 1], requires_grad=0, device=cpu) = aten::randn(%3288, %3289, %3290, %3291, %3292) # /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:86:0 - %5559 : Float(1, 192, 151, strides=[28992, 1, 192], requires_grad=0, device=cpu) = aten::randn_like(%m_p, %5554, %5555, %5556, %5557, %5558) # /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:86:0 - This may cause errors in trace checking. To disable trace checking, pass check_trace=False to torch.jit.trace() - _check_trace( - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/torch/jit/_trace.py:1116: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: - The values for attribute 'shape' do not match: torch.Size([1, 1, 39168]) != torch.Size([1, 1, 39424]). - _check_trace( - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/torch/jit/_trace.py:1116: TracerWarning: Output nr 2. of the traced function does not match the corresponding output of the Python function. Detailed error: - The values for attribute 'shape' do not match: torch.Size([1, 1, 153, 43]) != torch.Size([1, 1, 154, 43]). - _check_trace( - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/torch/jit/_trace.py:1116: TracerWarning: Output nr 3. of the traced function does not match the corresponding output of the Python function. Detailed error: - The values for attribute 'shape' do not match: torch.Size([1, 1, 153]) != torch.Size([1, 1, 154]). - _check_trace( - 2024-07-31 19:08:02.879488: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - -.. parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (199 / 199) │ 100% (199 / 199) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. - - -.. parsed-literal:: - - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/torch/jit/_trace.py:1116: TracerWarning: Trace had nondeterministic nodes. Did you forget call .eval() on your model? Nodes: - %1596 : Float(1, 192, 238, strides=[91392, 238, 1], requires_grad=0, device=cpu) = aten::randn_like(%m, %1591, %1592, %1593, %1594, %1595) # /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:86:0 - This may cause errors in trace checking. To disable trace checking, pass check_trace=False to torch.jit.trace() - _check_trace( - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/torch/jit/_trace.py:1116: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: - Tensor-likes are not close! 
- - Mismatched elements: 18501 / 60928 (30.4%) - Greatest absolute difference: 0.010825839242897928 at index (0, 0, 27067) (up to 1e-05 allowed) - Greatest relative difference: 9452.158227848102 at index (0, 0, 45473) (up to 1e-05 allowed) - _check_trace( - - -.. parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (194 / 194) │ 100% (194 / 194) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -Inference ---------- - - - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - core = ov.Core() - - device = device_widget() - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Select reference tone -~~~~~~~~~~~~~~~~~~~~~ - - - -First of all, select the reference tone of voice to which the generated -text will be converted: your can select from existing ones, record your -own by selecting ``record_manually`` or upload you own file by -``load_manually`` - -.. code:: ipython3 - - REFERENCE_VOICES_PATH = f"{repo_dir}/resources/" - reference_speakers = [ - *[path for path in os.listdir(REFERENCE_VOICES_PATH) if os.path.splitext(path)[-1] == ".mp3"], - "record_manually", - "load_manually", - ] - - ref_speaker = widgets.Dropdown( - options=reference_speakers, - value=reference_speakers[0], - description="reference voice from which tone color will be copied", - disabled=False, - ) - - ref_speaker - - - - -.. parsed-literal:: - - Dropdown(description='reference voice from which tone color will be copied', options=('demo_speaker2.mp3', 'ex… - - - -.. code:: ipython3 - - OUTPUT_DIR = Path("outputs/") - OUTPUT_DIR.mkdir(exist_ok=True) - -.. code:: ipython3 - - ref_speaker_path = f"{REFERENCE_VOICES_PATH}/{ref_speaker.value}" - allowed_audio_types = ".mp4,.mp3,.wav,.wma,.aac,.m4a,.m4b,.webm" - - if ref_speaker.value == "record_manually": - ref_speaker_path = OUTPUT_DIR / "custom_example_sample.webm" - from ipywebrtc import AudioRecorder, CameraStream - - camera = CameraStream(constraints={"audio": True, "video": False}) - recorder = AudioRecorder(stream=camera, filename=ref_speaker_path, autosave=True) - display(recorder) - elif ref_speaker.value == "load_manually": - upload_ref = widgets.FileUpload( - accept=allowed_audio_types, - multiple=False, - description="Select audio with reference voice", - ) - display(upload_ref) - -Play the reference voice sample before cloning it’s tone to another -speech - -.. code:: ipython3 - - def save_audio(voice_source: widgets.FileUpload, out_path: str): - with open(out_path, "wb") as output_file: - assert len(voice_source.value) > 0, "Please select audio file" - output_file.write(voice_source.value[0]["content"]) - - - if ref_speaker.value == "load_manually": - ref_speaker_path = f"{OUTPUT_DIR}/{upload_ref.value[0].name}" - save_audio(upload_ref, ref_speaker_path) - -.. code:: ipython3 - - Audio(ref_speaker_path) - - - - -.. raw:: html - - - - - - - -.. code:: ipython3 - - torch_hub_local = Path("torch_hub_local/") - %env TORCH_HOME={str(torch_hub_local.absolute())} - -Load speaker embeddings - -.. 
code:: ipython3 - - # second step to fix a problem with silero downloading and installing - import os - import zipfile - - url = "https://github.com/snakers4/silero-vad/zipball/v3.0" - - torch_hub_dir = torch_hub_local / "hub" - torch.hub.set_dir(torch_hub_dir.as_posix()) - - zip_filename = "v3.0.zip" - output_path = torch_hub_dir / "v3.0" - if not (torch_hub_dir / zip_filename).exists(): - download_file(url, directory=torch_hub_dir, filename=zip_filename) - zip_ref = zipfile.ZipFile((torch_hub_dir / zip_filename).as_posix(), "r") - zip_ref.extractall(path=output_path.as_posix()) - zip_ref.close() - - v3_dirs = [d for d in output_path.iterdir() if "snakers4-silero-vad" in d.as_posix()] - if len(v3_dirs) > 0 and not (torch_hub_dir / "snakers4_silero-vad_v3.0").exists(): - v3_dir = str(v3_dirs[0]) - os.rename(str(v3_dirs[0]), (torch_hub_dir / "snakers4_silero-vad_v3.0").as_posix()) - -.. code:: ipython3 - - en_source_default_se = torch.load(f"{en_suffix}/en_default_se.pth") - en_source_style_se = torch.load(f"{en_suffix}/en_style_se.pth") - zh_source_se = torch.load(f"{zh_suffix}/zh_default_se.pth") if enable_chinese_lang else None - - target_se, audio_name = se_extractor.get_se(ref_speaker_path, tone_color_converter, target_dir=OUTPUT_DIR, vad=True) - - -.. parsed-literal:: - - OpenVoice version: v1 - - -.. parsed-literal:: - - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/librosa/util/decorators.py:88: UserWarning: PySoundFile failed. Trying audioread instead. - return f(\*args, \*\*kwargs) - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/torch/hub.py:293: UserWarning: You are about to download and run code from an untrusted repository. In a future release, this won't be allowed. To add the repository to your trusted list, change the command to {calling_fn}(..., trust_repo=False) and a command prompt will appear asking for an explicit confirmation of trust, or load(..., trust_repo=True), which will assume that the prompt is to be answered with 'yes'. You can also use load(..., trust_repo='check') which will only prompt for confirmation if the repo is not already trusted. This will eventually be the default behaviour - warnings.warn( - Downloading: "https://github.com/snakers4/silero-vad/zipball/v3.0" to /home/ea/.cache/torch/hub/v3.0.zip - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1541: UserWarning: A window was not provided. A rectangular window will be applied,which is known to cause spectral leakage. Other windows such as torch.hann_window or torch.hamming_window can are recommended to reduce spectral leakage.To suppress this warning and use a rectangular window, explicitly set `window=torch.ones(n_fft, device=)`. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:836.) - return forward_call(\*args, \*\*kwargs) - - -.. parsed-literal:: - - [(0.0, 8.21), (9.292, 13.106), (13.228, 16.466), (16.684, 29.49225)] - after vad: dur = 28.07 - - -.. parsed-literal:: - - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/torch/functional.py:665: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. - Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:873.) 
- return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] - - -Replace original infer methods of ``OpenVoiceBaseClass`` with optimized -OpenVINO inference. - -There are pre and post processings that are not traceable and could not -be offloaded to OpenVINO, instead of writing such processing ourselves -we will rely on the already existing ones. We just replace infer and -voice conversion functions of ``OpenVoiceBaseClass`` so that the the -most computationally expensive part is done in OpenVINO. - -.. code:: ipython3 - - def get_pathched_infer(ov_model: ov.Model, device: str) -> callable: - compiled_model = core.compile_model(ov_model, device) - - def infer_impl(x, x_lengths, sid, noise_scale, length_scale, noise_scale_w): - ov_output = compiled_model((x, x_lengths, sid, noise_scale, length_scale, noise_scale_w)) - return (torch.tensor(ov_output[0]),) - - return infer_impl - - - def get_patched_voice_conversion(ov_model: ov.Model, device: str) -> callable: - compiled_model = core.compile_model(ov_model, device) - - def voice_conversion_impl(y, y_lengths, sid_src, sid_tgt, tau): - ov_output = compiled_model((y, y_lengths, sid_src, sid_tgt, tau)) - return (torch.tensor(ov_output[0]),) - - return voice_conversion_impl - - - en_base_speaker_tts.model.infer = get_pathched_infer(ov_en_tts, device.value) - tone_color_converter.model.voice_conversion = get_patched_voice_conversion(ov_voice_conversion, device.value) - if enable_chinese_lang: - zh_base_speaker_tts.model.infer = get_pathched_infer(ov_zh_tts, device.value) - -Run inference -~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - voice_source = widgets.Dropdown( - options=["use TTS", "choose_manually"], - value="use TTS", - description="Voice source", - disabled=False, - ) - - voice_source - - - - -.. parsed-literal:: - - Dropdown(description='Voice source', options=('use TTS', 'choose_manually'), value='use TTS') - - - -.. code:: ipython3 - - if voice_source.value == "choose_manually": - upload_orig_voice = widgets.FileUpload( - accept=allowed_audio_types, - multiple=False, - description="audo whose tone will be replaced", - ) - display(upload_orig_voice) - -.. code:: ipython3 - - if voice_source.value == "choose_manually": - orig_voice_path = f"{OUTPUT_DIR}/{upload_orig_voice.value[0].name}" - save_audio(upload_orig_voice, orig_voice_path) - source_se, _ = se_extractor.get_se(orig_voice_path, tone_color_converter, target_dir=OUTPUT_DIR, vad=True) - else: - text = """ - OpenVINO toolkit is a comprehensive toolkit for quickly developing applications and solutions that solve - a variety of tasks including emulation of human vision, automatic speech recognition, natural language processing, - recommendation systems, and many others. - """ - source_se = en_source_default_se - orig_voice_path = OUTPUT_DIR / "tmp.wav" - en_base_speaker_tts.tts(text, orig_voice_path, speaker="default", language="English") - - -.. parsed-literal:: - - > Text splitted to sentences. - OpenVINO toolkit is a comprehensive toolkit for quickly developing applications and solutions that solve a variety of tasks including emulation of human vision, - automatic speech recognition, natural language processing, recommendation systems, and many others. 
- > =========================== - ˈoʊpən vino* toolkit* ɪz ə ˌkɑmpɹiˈhɛnsɪv toolkit* fəɹ kˈwɪkli dɪˈvɛləpɪŋ ˌæpləˈkeɪʃənz ənd səˈluʃənz ðət sɑɫv ə vəɹˈaɪəti əv tæsks ˌɪnˈkludɪŋ ˌɛmjəˈleɪʃən əv ˈjumən ˈvɪʒən, - length:173 - length:173 - ˌɔtəˈmætɪk spitʃ ˌɹɛkɪgˈnɪʃən, ˈnætʃəɹəɫ ˈlæŋgwɪdʒ ˈpɹɑsɛsɪŋ, ˌɹɛkəmənˈdeɪʃən ˈsɪstəmz, ənd ˈmɛni ˈəðəɹz. - length:105 - length:105 - - -And finally, run voice tone conversion with OpenVINO optimized model - -.. code:: ipython3 - - tau_slider = widgets.FloatSlider( - value=0.3, - min=0.01, - max=2.0, - step=0.01, - description="tau", - disabled=False, - readout_format=".2f", - ) - tau_slider - - - - -.. parsed-literal:: - - FloatSlider(value=0.3, description='tau', max=2.0, min=0.01, step=0.01) - - - -.. code:: ipython3 - - resulting_voice_path = OUTPUT_DIR / "output_with_cloned_voice_tone.wav" - - tone_color_converter.convert( - audio_src_path=orig_voice_path, - src_se=source_se, - tgt_se=target_se, - output_path=resulting_voice_path, - tau=tau_slider.value, - message="@MyShell", - ) - -.. code:: ipython3 - - Audio(orig_voice_path) - - - - -.. raw:: html - - - - - - - -.. code:: ipython3 - - Audio(resulting_voice_path) - - - - -.. raw:: html - - - - - - - -Run OpenVoice Gradio interactive demo -------------------------------------- - - - -We can also use `Gradio `__ app to run TTS and -voice tone conversion online. - -.. code:: ipython3 - - import gradio as gr - import langid - - supported_languages = ["zh", "en"] - - - def predict_impl( - prompt, - style, - audio_file_pth, - agree, - output_dir, - tone_color_converter, - en_tts_model, - zh_tts_model, - en_source_default_se, - en_source_style_se, - zh_source_se, - ): - text_hint = "" - if not agree: - text_hint += "[ERROR] Please accept the Terms & Condition!\n" - gr.Warning("Please accept the Terms & Condition!") - return ( - text_hint, - None, - None, - ) - - language_predicted = langid.classify(prompt)[0].strip() - print(f"Detected language:{language_predicted}") - - if language_predicted not in supported_languages: - text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n" - gr.Warning(f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}") - - return ( - text_hint, - None, - ) - - if language_predicted == "zh": - tts_model = zh_tts_model - if zh_tts_model is None: - gr.Warning("TTS model for Chinece language was not loaded please set 'enable_chinese_lang=True`") - return ( - text_hint, - None, - ) - source_se = zh_source_se - language = "Chinese" - if style not in ["default"]: - text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n" - gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']") - return ( - text_hint, - None, - ) - else: - tts_model = en_tts_model - if style == "default": - source_se = en_source_default_se - else: - source_se = en_source_style_se - language = "English" - supported_styles = [ - "default", - "whispering", - "shouting", - "excited", - "cheerful", - "terrified", - "angry", - "sad", - "friendly", - ] - if style not in supported_styles: - text_hint += f"[ERROR] The style {style} is not supported for English, which should be in {*supported_styles,}\n" - gr.Warning(f"The style {style} is not supported for English, which should be in {*supported_styles,}") - return ( - text_hint, - None, - ) - - speaker_wav = audio_file_pth - - if len(prompt) 
< 2: - text_hint += "[ERROR] Please give a longer prompt text \n" - gr.Warning("Please give a longer prompt text") - return ( - text_hint, - None, - ) - if len(prompt) > 200: - text_hint += ( - "[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n" - ) - gr.Warning("Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage") - return ( - text_hint, - None, - ) - - # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference - try: - target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir=OUTPUT_DIR, vad=True) - except Exception as e: - text_hint += f"[ERROR] Get target tone color error {str(e)} \n" - gr.Warning("[ERROR] Get target tone color error {str(e)} \n") - return ( - text_hint, - None, - ) - - src_path = f"{output_dir}/tmp.wav" - tts_model.tts(prompt, src_path, speaker=style, language=language) - - save_path = f"{output_dir}/output.wav" - encode_message = "@MyShell" - tone_color_converter.convert( - audio_src_path=src_path, - src_se=source_se, - tgt_se=target_se, - output_path=save_path, - message=encode_message, - ) - - text_hint += "Get response successfully \n" - - return ( - text_hint, - src_path, - save_path, - ) - -.. code:: ipython3 - - from functools import partial - - - predict = partial( - predict_impl, - output_dir=OUTPUT_DIR, - tone_color_converter=tone_color_converter, - en_tts_model=en_base_speaker_tts, - zh_tts_model=zh_base_speaker_tts, - en_source_default_se=en_source_default_se, - en_source_style_se=en_source_style_se, - zh_source_se=zh_source_se, - ) - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/openvoice/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=predict) - - try: - demo.queue(max_size=2).launch(debug=True, height=1000) - except Exception: - demo.queue(max_size=2).launch(share=True, debug=True, height=1000) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - -.. code:: ipython3 - - # please uncomment and run this cell for stopping gradio interface - # demo.close() - -Cleanup -------- - - - -.. code:: ipython3 - - # import shutil - # shutil.rmtree(CKPT_BASE_PATH) - # shutil.rmtree(IRS_PATH) - # shutil.rmtree(OUTPUT_DIR) diff --git a/docs/notebooks/optical-character-recognition-with-output.rst b/docs/notebooks/optical-character-recognition-with-output.rst deleted file mode 100644 index 773a666f64b755..00000000000000 --- a/docs/notebooks/optical-character-recognition-with-output.rst +++ /dev/null @@ -1,481 +0,0 @@ -Optical Character Recognition (OCR) with OpenVINO™ -================================================== - -This tutorial demonstrates how to perform optical character recognition -(OCR) with OpenVINO models. It is a continuation of the -`hello-detection `__ tutorial, -which shows only text detection. - -The -`horizontal-text-detection-0001 `__ -and -`text-recognition-0014 `__ -models are used together for text detection and then text recognition. 
- - -**Table of contents:** - - -- `Imports <#imports>`__ -- `Settings <#settings>`__ -- `Download Models <#download-models>`__ -- `Select inference device <#select-inference-device>`__ -- `Object Detection <#object-detection>`__ - - - `Load a Detection Model <#load-a-detection-model>`__ - - `Load an Image <#load-an-image>`__ - - `Do Inference <#do-inference>`__ - - `Get Detection Results <#get-detection-results>`__ - -- `Text Recognition <#text-recognition>`__ - - - `Load Text Recognition Model <#load-text-recognition-model>`__ - - `Do Inference <#do-inference>`__ - -- `Show Results <#show-results>`__ - - - `Show Detected Text Boxes and OCR Results for the - Image <#show-detected-text-boxes-and-ocr-results-for-the-image>`__ - - `Show the OCR Result per Bounding - Box <#show-the-ocr-result-per-bounding-box>`__ - - `Print Annotations in Plain Text - Format <#print-annotations-in-plain-text-format>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - # Install openvino package - %pip install -q "openvino>=2024.4.0" pillow opencv-python "matplotlib>=3.4" - -Imports -------- - - - -.. code:: ipython3 - - from pathlib import Path - - import cv2 - import matplotlib.pyplot as plt - import numpy as np - import openvino as ov - from IPython.display import Markdown, display - from PIL import Image - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("optical-character-recognition.ipynb") - -Settings --------- - - - -.. code:: ipython3 - - core = ov.Core() - - model_dir = Path("model") - precision = "FP16" - detection_model = "horizontal-text-detection-0001" - recognition_model = "text-recognition-0014" - - model_dir.mkdir(exist_ok=True) - -Download Models ---------------- - - - -The next cells will download the detection and recognition models. If -the models have been downloaded before, they will not be downloaded -again. - -.. code:: ipython3 - - from notebook_utils import download_ir_model - - detection_model_url = f"https://storage.openvinotoolkit.org/repositories/open_model_zoo/2023.0/models_bin/1/{detection_model}/{precision}/{detection_model}.xml" - - recognition_model_url = ( - f"https://storage.openvinotoolkit.org/repositories/open_model_zoo/2023.0/models_bin/1/{recognition_model}/{precision}/{recognition_model}.xml" - ) - - detection_model_path = model_dir / detection_model / precision / f"{detection_model}.xml" - if not detection_model_path.exists(): - detection_model_path = download_ir_model(detection_model_url, model_dir / detection_model / precision) - - recognition_model_path = model_dir / recognition_model / precision / f"{recognition_model}.xml" - - if not recognition_model_path.exists(): - recognition_model_path = download_ir_model(recognition_model_url, model_dir / recognition_model / precision) - - -.. 
parsed-literal:: - - 'model/horizontal-text-detection-0001/FP16/horizontal-text-detection-0001.xml' already exists. - 'model/horizontal-text-detection-0001/FP16/horizontal-text-detection-0001.bin' already exists. - 'model/text-recognition-0014/FP16/text-recognition-0014.xml' already exists. - 'model/text-recognition-0014/FP16/text-recognition-0014.bin' already exists. - - -Select inference device ------------------------ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Object Detection ----------------- - - - -Load a detection model, load an image, do inference and get the -detection inference result. - -Load a Detection Model -~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - detection_model = core.read_model(detection_model_path) - detection_compiled_model = core.compile_model(model=detection_model, device_name=device.value) - - detection_input_layer = detection_compiled_model.input(0) - -Load an Image -~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - image_path = Path("intel_rnb.jpg") - - if not image_path.exists(): - download_file( - url="https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/intel_rnb.jpg", - filename=image_path.name, - directory=image_path.parent, - ) - else: - image = cv2.imread(str(image_path)) - - # N,C,H,W = batch size, number of channels, height, width. - N, C, H, W = detection_input_layer.shape - - # Resize the image to meet network expected input sizes. - resized_image = cv2.resize(image, (W, H)) - - # Reshape to the network input shape. - input_image = np.expand_dims(resized_image.transpose(2, 0, 1), 0) - - plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)); - - - -.. image:: optical-character-recognition-with-output_files/optical-character-recognition-with-output_13_0.png - - -Do Inference -~~~~~~~~~~~~ - - - -Text boxes are detected in the images and returned as blobs of data in -the shape of ``[100, 5]``. Each description of detection has the -``[x_min, y_min, x_max, y_max, conf]`` format. - -.. code:: ipython3 - - output_key = detection_compiled_model.output("boxes") - boxes = detection_compiled_model([input_image])[output_key] - - # Remove zero only boxes. - boxes = boxes[~np.all(boxes == 0, axis=1)] - -Get Detection Results -~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - def multiply_by_ratio(ratio_x, ratio_y, box): - return [max(shape * ratio_y, 10) if idx % 2 else shape * ratio_x for idx, shape in enumerate(box[:-1])] - - - def run_preprocesing_on_crop(crop, net_shape): - temp_img = cv2.resize(crop, net_shape) - temp_img = temp_img.reshape((1,) * 2 + temp_img.shape) - return temp_img - - - def convert_result_to_image(bgr_image, resized_image, boxes, threshold=0.3, conf_labels=True): - # Define colors for boxes and descriptions. - colors = {"red": (255, 0, 0), "green": (0, 255, 0), "white": (255, 255, 255)} - - # Fetch image shapes to calculate a ratio. - (real_y, real_x), (resized_y, resized_x) = image.shape[:2], resized_image.shape[:2] - ratio_x, ratio_y = real_x / resized_x, real_y / resized_y - - # Convert the base image from BGR to RGB format. - rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB) - - # Iterate through non-zero boxes. - for box, annotation in boxes: - # Pick a confidence factor from the last place in an array. 
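-             # Each detection row is [x_min, y_min, x_max, y_max, conf]; rows below `threshold` are skipped.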
- conf = box[-1] - if conf > threshold: - # Convert float to int and multiply position of each box by x and y ratio. - (x_min, y_min, x_max, y_max) = map(int, multiply_by_ratio(ratio_x, ratio_y, box)) - - # Draw a box based on the position. Parameters in the `rectangle` function are: image, start_point, end_point, color, thickness. - cv2.rectangle(rgb_image, (x_min, y_min), (x_max, y_max), colors["green"], 3) - - # Add a text to an image based on the position and confidence. Parameters in the `putText` function are: image, text, bottomleft_corner_textfield, font, font_scale, color, thickness, line_type - if conf_labels: - # Create a background box based on annotation length. - (text_w, text_h), _ = cv2.getTextSize(f"{annotation}", cv2.FONT_HERSHEY_TRIPLEX, 0.8, 1) - image_copy = rgb_image.copy() - cv2.rectangle( - image_copy, - (x_min, y_min - text_h - 10), - (x_min + text_w, y_min - 10), - colors["white"], - -1, - ) - # Add weighted image copy with white boxes under a text. - cv2.addWeighted(image_copy, 0.4, rgb_image, 0.6, 0, rgb_image) - cv2.putText( - rgb_image, - f"{annotation}", - (x_min, y_min - 10), - cv2.FONT_HERSHEY_SIMPLEX, - 0.8, - colors["red"], - 1, - cv2.LINE_AA, - ) - - return rgb_image - -Text Recognition ----------------- - - - -Load the text recognition model and do inference on the detected boxes -from the detection model. - -Load Text Recognition Model -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - recognition_model = core.read_model(recognition_model_path) - - recognition_compiled_model = core.compile_model(model=recognition_model, device_name=device.value) - - recognition_output_layer = recognition_compiled_model.output(0) - recognition_input_layer = recognition_compiled_model.input(0) - - # Get the height and width of the input layer. - _, _, H, W = recognition_input_layer.shape - -Do Inference -~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Calculate scale for image resizing. - (real_y, real_x), (resized_y, resized_x) = image.shape[:2], resized_image.shape[:2] - ratio_x, ratio_y = real_x / resized_x, real_y / resized_y - - # Convert the image to grayscale for the text recognition model. - grayscale_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - - # Get a dictionary to encode output, based on the model documentation. - letters = "#1234567890abcdefghijklmnopqrstuvwxyz" - - # Prepare an empty list for annotations. - annotations = list() - cropped_images = list() - # fig, ax = plt.subplots(len(boxes), 1, figsize=(5,15), sharex=True, sharey=True) - # Get annotations for each crop, based on boxes given by the detection model. - for i, crop in enumerate(boxes): - # Get coordinates on corners of a crop. - (x_min, y_min, x_max, y_max) = map(int, multiply_by_ratio(ratio_x, ratio_y, crop)) - image_crop = run_preprocesing_on_crop(grayscale_image[y_min:y_max, x_min:x_max], (W, H)) - - # Run inference with the recognition model. - result = recognition_compiled_model([image_crop])[recognition_output_layer] - - # Squeeze the output to remove unnecessary dimension. - recognition_results_test = np.squeeze(result) - - # Read an annotation based on probabilities from the output layer. - annotation = list() - for letter in recognition_results_test: - parsed_letter = letters[letter.argmax()] - - # Returning 0 index from `argmax` signalizes an end of a string. 
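-             # Index 0 corresponds to the "#" symbol in `letters`; it is skipped, so only real characters are appended.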
- if parsed_letter == letters[0]: - continue - annotation.append(parsed_letter) - annotations.append("".join(annotation)) - cropped_image = Image.fromarray(image[y_min:y_max, x_min:x_max]) - cropped_images.append(cropped_image) - - boxes_with_annotations = list(zip(boxes, annotations)) - -Show Results ------------- - - - -Show Detected Text Boxes and OCR Results for the Image -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Visualize the result by drawing boxes around recognized text and showing -the OCR result from the text recognition model. - -.. code:: ipython3 - - plt.figure(figsize=(12, 12)) - plt.imshow(convert_result_to_image(image, resized_image, boxes_with_annotations, conf_labels=True)); - - - -.. image:: optical-character-recognition-with-output_files/optical-character-recognition-with-output_23_0.png - - -Show the OCR Result per Bounding Box -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Depending on the image, the OCR result may not be readable in the image -with boxes, as displayed in the cell above. Use the code below to -display the extracted boxes and the OCR result per box. - -.. code:: ipython3 - - for cropped_image, annotation in zip(cropped_images, annotations): - display(cropped_image, Markdown("".join(annotation))) - - - -.. image:: optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_0.png - - - -buildinng - - - -.. image:: optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_2.png - - - -noyce - - - -.. image:: optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_4.png - - - -2200 - - - -.. image:: optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_6.png - - - -n - - - -.. image:: optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_8.png - - - -center - - - -.. image:: optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_10.png - - - -robert - - -Print Annotations in Plain Text Format -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Print annotations for detected text based on their position in the input -image, starting from the upper left corner. - -.. code:: ipython3 - - [annotation for _, annotation in sorted(zip(boxes, annotations), key=lambda x: x[0][0] ** 2 + x[0][1] ** 2)] - - - - -.. 
parsed-literal:: - - ['robert', 'n', 'noyce', 'buildinng', '2200', 'center'] - - diff --git a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_13_0.png b/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_13_0.png deleted file mode 100644 index ebf16a1227e05a..00000000000000 --- a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_13_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b33f8353381b9720d63b71906ca069ea3356631b5e8cf768688f4cf9d91a8c62 -size 305482 diff --git a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_23_0.png b/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_23_0.png deleted file mode 100644 index ec3b25d807c904..00000000000000 --- a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_23_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f632a329780c74a18501c7957b6d1312f19e3a16886baa4f218a1b1b79ac3bac -size 923830 diff --git a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_0.jpg b/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_0.jpg deleted file mode 100644 index 6422dd9abf672b..00000000000000 --- a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a4890b864f6944a9b3c83f4ce079f0deccaaa61acf0d2fd712cae6baee430477 -size 1996 diff --git a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_0.png b/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_0.png deleted file mode 100644 index bd61b72ac1412f..00000000000000 --- a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3afa4268972711c6dd52892c4fa0f9600df3ae40bbec42d5867b83fd75f9308c -size 11367 diff --git a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_10.jpg b/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_10.jpg deleted file mode 100644 index ae1d914ec98647..00000000000000 --- a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_10.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2cb4dd58ab1a6cdd6e27c6f01d8ca70e0bb91c93f7c0998e320d6fce690abf2e -size 1990 diff --git a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_10.png b/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_10.png deleted file mode 100644 index 5a55d5db6ea87b..00000000000000 --- a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_10.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:94b08d3c87c16bdeb7e045d07cc22a3f4c5fcd8ad1072922d738ca439ecfe5b5 -size 11142 diff --git a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_2.jpg b/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_2.jpg deleted file mode 100644 index a312084092663a..00000000000000 --- a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_2.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:db2f98f6c01ae07afaaa599619c79520bf9ddd5ae50c278049425de39de5c605 -size 1630 diff --git a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_2.png b/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_2.png deleted file mode 100644 index 2ed7e4fecac86f..00000000000000 --- a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_2.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c8e7c7ac51dcffb1ba3a99d88260b978cec59f0b0f9b40fc3da3097782bd6150 -size 8428 diff --git a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_4.jpg b/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_4.jpg deleted file mode 100644 index 4b934bb35144c0..00000000000000 --- a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_4.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5ef2170c4f2461cddaa3ddb96e2f6b33c57e5c4720df0043304a255f2156616a -size 949 diff --git a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_4.png b/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_4.png deleted file mode 100644 index e2b8c366cb15cb..00000000000000 --- a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_4.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fc5555e26200b462cca21e304ece792ca2ddab2c5dd86e6d4ec8144f8eca9f92 -size 2274 diff --git a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_6.jpg b/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_6.jpg deleted file mode 100644 index d42c334e3a7e51..00000000000000 --- a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_6.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:71268d52cc134e9cc3db5d406f510d159335c9dbadfce61bc4cd126e14f22e0b -size 817 diff --git a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_6.png b/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_6.png deleted file mode 100644 index fc0e170ddf7b84..00000000000000 --- a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_6.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:4743bff968c844ca140b02de6a81ad1c7bff1b6fb5631914a23fc0b142078fad -size 1559 diff --git a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_8.jpg b/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_8.jpg deleted file mode 100644 index 3248b4a831a130..00000000000000 --- a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_8.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c6732070cf998297b36d1906b85aee083c26381b6c88fdba66318856dae85e08 -size 838 diff --git a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_8.png b/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_8.png deleted file mode 100644 index 6a1fd3b1812d45..00000000000000 --- a/docs/notebooks/optical-character-recognition-with-output_files/optical-character-recognition-with-output_25_8.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:be1b9a415db80e697ee0a86073d6155ce197d324a0161162e819c5c0995fbcb1 -size 1487 diff --git a/docs/notebooks/optimize-preprocessing-with-output.rst b/docs/notebooks/optimize-preprocessing-with-output.rst deleted file mode 100644 index c4b8424647d3d0..00000000000000 --- a/docs/notebooks/optimize-preprocessing-with-output.rst +++ /dev/null @@ -1,643 +0,0 @@ -Optimize Preprocessing -====================== - -When input data does not fit the model input tensor perfectly, -additional operations/steps are needed to transform the data to the -format expected by the model. This tutorial demonstrates how it could be -performed with Preprocessing API. Preprocessing API is an easy-to-use -instrument, that enables integration of preprocessing steps into an -execution graph and performing it on a selected device, which can -improve device utilization. For more information about Preprocessing -API, see this -`overview `__ -and -`details `__ - -This tutorial include following steps: - -- Downloading the model. -- Setup preprocessing with Preprocessing API, loading the model and - inference with original image. -- Fitting image to the model input type and inference with prepared - image. -- Comparing results on one picture. -- Comparing performance. 
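-
- As a quick preview, the whole flow boils down to describing the user
- data, the preprocessing steps, and the model layout on a
- ``PrePostProcessor`` object and then rebuilding the model. The sketch
- below condenses the code that the following sections explain in
- detail; the IR file path is the one created later in this tutorial.
-
- .. code:: python
-
-     import openvino as ov
-     from openvino.preprocess import PrePostProcessor, ResizeAlgorithm
-
-     core = ov.Core()
-     model = core.read_model("model/ir_model/InceptionResNetV2.xml")
-
-     ppp = PrePostProcessor(model)
-     # User data: dynamically sized U8 NHWC images, as read by OpenCV.
-     ppp.input().tensor().set_element_type(ov.Type.u8).set_spatial_dynamic_shape().set_layout(ov.Layout("NHWC"))
-     # Steps: convert precision, resize to the model size, normalize.
-     ppp.input().preprocess().convert_element_type(ov.Type.f32).resize(ResizeAlgorithm.RESIZE_LINEAR).mean([127.5] * 3).scale([127.5] * 3)
-     # Model data format: only the layout needs to be declared.
-     ppp.input().model().set_layout(ov.Layout("NHWC"))
-
-     compiled_model = core.compile_model(ppp.build(), "AUTO")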
- - -**Table of contents:** - - -- `Settings <#settings>`__ -- `Imports <#imports>`__ - - - `Setup image and device <#setup-image-and-device>`__ - - `Downloading the model <#downloading-the-model>`__ - - `Create core <#create-core>`__ - - `Check the original parameters of - image <#check-the-original-parameters-of-image>`__ - -- `Setup preprocessing steps with Preprocessing API and perform - inference <#setup-preprocessing-steps-with-preprocessing-api-and-perform-inference>`__ - - - `Convert model to OpenVINO IR with model conversion - API <#convert-model-to-openvino-ir-with-model-conversion-api>`__ - - `Create PrePostProcessor - Object <#create-prepostprocessor-object>`__ - - `Declare User’s Data Format <#declare-users-data-format>`__ - - `Declaring Model Layout <#declaring-model-layout>`__ - - `Preprocessing Steps <#preprocessing-steps>`__ - - `Integrating Steps into a - Model <#integrating-steps-into-a-model>`__ - -- `Load model and perform - inference <#load-model-and-perform-inference>`__ -- `Fit image manually and perform - inference <#fit-image-manually-and-perform-inference>`__ - - - `Load the model <#load-the-model>`__ - - `Load image and fit it to model - input <#load-image-and-fit-it-to-model-input>`__ - - `Perform inference <#perform-inference>`__ - -- `Compare results <#compare-results>`__ - - - `Compare results on one image <#compare-results-on-one-image>`__ - - `Compare performance <#compare-performance>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Settings --------- - - - -.. code:: ipython3 - - # Install openvino package - %pip install -q "openvino>=2023.1.0" opencv-python tqdm "matplotlib>=3.4" - - %pip install -q "tensorflow-macos>=2.5; sys_platform == 'darwin' and platform_machine == 'arm64' and python_version > '3.8'" # macOS M1 and M2 - %pip install -q "tensorflow>=2.5; sys_platform == 'darwin' and platform_machine != 'arm64' and python_version > '3.8'" # macOS x86 - %pip install -q "tensorflow>=2.5; sys_platform != 'darwin' and python_version > '3.8'" - %pip install -q --no-deps tensorflow_hub - %pip install -q tf_keras - -Imports -------- - - - -.. code:: ipython3 - - import time - import os - from pathlib import Path - - os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" - os.environ["TF_USE_LEGACY_KERAS"] = "1" - - import cv2 - import matplotlib.pyplot as plt - import numpy as np - import openvino as ov - import tensorflow as tf - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("optimize-preprocessing.ipynb") - - -.. parsed-literal:: - - 2023-07-10 12:05:28.419803: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
- 2023-07-10 12:05:28.457913: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2023-07-10 12:05:29.065916: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - -Setup image and device -~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Download the image from the openvino_notebooks storage - image_path = Path("data/coco.jpg") - - if not image_path.exists(): - image_path = download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco.jpg", - directory="data", - ) - image_path = str(image_path) - -.. code:: ipython3 - - core = ov.Core() - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -Downloading the model -~~~~~~~~~~~~~~~~~~~~~ - - - -This tutorial uses the -`InceptionResNetV2 `__. -The InceptionResNetV2 model is the second of the -`Inception `__ -family of models designed to perform image classification. Like other -Inception models, InceptionResNetV2 has been pre-trained on the -`ImageNet `__ data set. For more details about -this family of models, see the `research -paper `__. - -Load the model by using `tf.keras.applications -api `__ -and save it to the disk. - -.. code:: ipython3 - - model_name = "InceptionResNetV2" - - model_dir = Path("model") - model_dir.mkdir(exist_ok=True) - - model_path = model_dir / model_name - - model = tf.keras.applications.InceptionV3() - model.save(model_path) - - -.. parsed-literal:: - - 2023-09-07 13:15:54.259701: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform. - Skipping registering GPU devices... - - -.. parsed-literal:: - - WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model. - INFO:tensorflow:Assets written to: model/InceptionResNetV2/assets - - -.. parsed-literal:: - - INFO:tensorflow:Assets written to: model/InceptionResNetV2/assets - - -Create core -~~~~~~~~~~~ - - - -.. code:: ipython3 - - core = ov.Core() - -Check the original parameters of image -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - image = cv2.imread(image_path) - plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) - print(f"The original shape of the image is {image.shape}") - print(f"The original data type of the image is {image.dtype}") - - -.. parsed-literal:: - - The original shape of the image is (577, 800, 3) - The original data type of the image is uint8 - - - -.. image:: optimize-preprocessing-with-output_files/optimize-preprocessing-with-output_14_1.png - - -Setup preprocessing steps with Preprocessing API and perform inference ----------------------------------------------------------------------- - - - -Intuitively, preprocessing API consists of the following parts: - -- Tensor - declares user data format, like shape, layout, precision, - color format from actual user’s data. 
-- Steps - describes sequence of preprocessing steps which need to be - applied to user data. -- Model - specifies model data format. Usually, precision and shape are - already known for model, only additional information, like layout can - be specified. - -Graph modifications of a model shall be performed after the model is -read from a drive and before it is loaded on the actual device. - -Pre-processing support following operations (please, see more details -`here `__) - -- Mean/Scale Normalization -- Converting Precision -- Converting layout (transposing) -- Resizing Image -- Color Conversion -- Custom Operations - -Convert model to OpenVINO IR with model conversion API -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The options for preprocessing are not required. - -.. code:: ipython3 - - ir_path = model_dir / "ir_model" / f"{model_name}.xml" - - ppp_model = None - - if ir_path.exists(): - ppp_model = core.read_model(model=ir_path) - print(f"Model in OpenVINO format already exists: {ir_path}") - else: - ppp_model = ov.convert_model(model_path, input=[1, 299, 299, 3]) - ov.save_model(ppp_model, str(ir_path)) - - -.. parsed-literal:: - - Model in OpenVINO format already exists: model/ir_model/InceptionResNetV2.xml - - -Create ``PrePostProcessor`` Object -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The -`PrePostProcessor() `__ -class enables specifying the preprocessing and postprocessing steps for -a model. - -.. code:: ipython3 - - from openvino.preprocess import PrePostProcessor - - ppp = PrePostProcessor(ppp_model) - -Declare User’s Data Format -~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To address particular input of a model/preprocessor, use the -``PrePostProcessor.input(input_name)`` method. If the model has only one -input, then simple ``PrePostProcessor.input()`` will get a reference to -pre-processing builder for this input (a tensor, the steps, a model). In -general, when a model has multiple inputs/outputs, each one can be -addressed by a tensor name or by its index. By default, information -about user’s input tensor will be initialized to same data -(type/shape/etc) as model’s input parameter. User application can -override particular parameters according to application’s data. Refer to -the following -`page `__ -for more information about parameters for overriding. - -Below is all the specified input information: - -- Precision is ``U8`` (unsigned 8-bit integer). -- Size is non-fixed, setup of one determined shape size can be done - with ``.set_shape([1, 577, 800, 3])`` -- Layout is ``“NHWC”``. It means, for example: height=577, width=800, - channels=3. - -The height and width are necessary for resizing, and channels are needed -for mean/scale normalization. - -.. code:: ipython3 - - # setup formant of data - ppp.input().tensor().set_element_type(ov.Type.u8).set_spatial_dynamic_shape().set_layout(ov.Layout("NHWC")) - - - - -.. parsed-literal:: - - - - - -Declaring Model Layout -~~~~~~~~~~~~~~~~~~~~~~ - - - -Model input already has information about precision and shape. -Preprocessing API is not intended to modify this. The only thing that -may be specified is input data -`layout `__. - -.. code:: ipython3 - - input_layer_ir = next(iter(ppp_model.inputs)) - print(f"The input shape of the model is {input_layer_ir.shape}") - - ppp.input().model().set_layout(ov.Layout("NHWC")) - - -.. parsed-literal:: - - The input shape of the model is [1,299,299,3] - - - - -.. 
parsed-literal:: - - - - - -Preprocessing Steps -~~~~~~~~~~~~~~~~~~~ - - - -Now, the sequence of preprocessing steps can be defined. For more -information about preprocessing steps, see -`here `__. - -Perform the following: - -- Convert ``U8`` to ``FP32`` precision. -- Resize to height/width of a model. Be aware that if a model accepts - dynamic size, for example, ``{?, 3, ?, ?}`` resize will not know how - to resize the picture. Therefore, in this case, target height/ width - should be specified. For more details, see also the - `PreProcessSteps.resize() `__. -- Subtract mean from each channel. -- Divide each pixel data to appropriate scale value. - -There is no need to specify conversion layout. If layouts are different, -then such conversion will be added explicitly. - -.. code:: ipython3 - - from openvino.preprocess import ResizeAlgorithm - - ppp.input().preprocess().convert_element_type(ov.Type.f32).resize(ResizeAlgorithm.RESIZE_LINEAR).mean([127.5, 127.5, 127.5]).scale([127.5, 127.5, 127.5]) - - - - -.. parsed-literal:: - - - - - -Integrating Steps into a Model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Once the preprocessing steps have been finished, the model can be -finally built. It is possible to display ``PrePostProcessor`` -configuration for debugging purposes. - -.. code:: ipython3 - - print(f"Dump preprocessor: {ppp}") - model_with_preprocess = ppp.build() - - -.. parsed-literal:: - - Dump preprocessor: Input "Func/StatefulPartitionedCall/input/_0:0": - User's input tensor: [1,?,?,3], [N,H,W,C], u8 - Model's expected tensor: [1,299,299,3], [N,H,W,C], f32 - Pre-processing steps (4): - convert type (f32): ([1,?,?,3], [N,H,W,C], u8) -> ([1,?,?,3], [N,H,W,C], f32) - resize to model width/height: ([1,?,?,3], [N,H,W,C], f32) -> ([1,299,299,3], [N,H,W,C], f32) - mean (127.5,127.5,127.5): ([1,299,299,3], [N,H,W,C], f32) -> ([1,299,299,3], [N,H,W,C], f32) - scale (127.5,127.5,127.5): ([1,299,299,3], [N,H,W,C], f32) -> ([1,299,299,3], [N,H,W,C], f32) - - - -Load model and perform inference --------------------------------- - - - -.. code:: ipython3 - - def prepare_image_api_preprocess(image_path, model=None): - image = cv2.imread(image_path) - input_tensor = np.expand_dims(image, 0) - return input_tensor - - - compiled_model_with_preprocess_api = core.compile_model(model=ppp_model, device_name=device.value) - - ppp_output_layer = compiled_model_with_preprocess_api.output(0) - - ppp_input_tensor = prepare_image_api_preprocess(image_path) - results = compiled_model_with_preprocess_api(ppp_input_tensor)[ppp_output_layer][0] - -Fit image manually and perform inference ----------------------------------------- - - - -Load the model -~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - model = core.read_model(model=ir_path) - compiled_model = core.compile_model(model=model, device_name=device.value) - -Load image and fit it to model input -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - def manual_image_preprocessing(path_to_image, compiled_model): - input_layer_ir = next(iter(compiled_model.inputs)) - - # N, H, W, C = batch size, height, width, number of channels - N, H, W, C = input_layer_ir.shape - - # load image, image will be resized to model input size and converted to RGB - img = tf.keras.preprocessing.image.load_img(image_path, target_size=(H, W), color_mode="rgb") - - x = tf.keras.preprocessing.image.img_to_array(img) - x = np.expand_dims(x, axis=0) - - # will scale input pixels between -1 and 1 - input_tensor = tf.keras.applications.inception_resnet_v2.preprocess_input(x) - - return input_tensor - - - input_tensor = manual_image_preprocessing(image_path, compiled_model) - print(f"The shape of the image is {input_tensor.shape}") - print(f"The data type of the image is {input_tensor.dtype}") - - -.. parsed-literal:: - - The shape of the image is (1, 299, 299, 3) - The data type of the image is float32 - - -Perform inference -~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - output_layer = compiled_model.output(0) - - result = compiled_model(input_tensor)[output_layer] - -Compare results ---------------- - - - -Compare results on one image -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - def check_results(input_tensor, compiled_model, imagenet_classes): - output_layer = compiled_model.output(0) - - results = compiled_model(input_tensor)[output_layer][0] - - top_indices = np.argsort(results)[-5:][::-1] - top_softmax = results[top_indices] - - for index, softmax_probability in zip(top_indices, top_softmax): - print(f"{imagenet_classes[index]}, {softmax_probability:.5f}") - - return top_indices, top_softmax - - - # Convert the inference result to a class name. - imagenet_filename = Path("data/imagenet_2012.txt") - - if not imagenet_filename.exists(): - imagenet_filename = download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/datasets/imagenet/imagenet_2012.txt", - directory="data", - ) - imagenet_classes = imagenet_filename.read_text().splitlines() - imagenet_classes = ["background"] + imagenet_classes - - # get result for inference with preprocessing api - print("Result of inference with Preprocessing API:") - res = check_results(ppp_input_tensor, compiled_model_with_preprocess_api, imagenet_classes) - - print("\n") - - # get result for inference with the manual preparing of the image - print("Result of inference with manual image setup:") - res = check_results(input_tensor, compiled_model, imagenet_classes) - - -.. parsed-literal:: - - Result of inference with Preprocessing API: - n02099601 golden retriever, 0.80578 - n02098413 Lhasa, Lhasa apso, 0.09990 - n02108915 French bulldog, 0.01920 - n02111129 Leonberg, 0.00829 - n02097047 miniature schnauzer, 0.00294 - - - Result of inference with manual image setup: - n02098413 Lhasa, Lhasa apso, 0.76843 - n02099601 golden retriever, 0.19322 - n02111129 Leonberg, 0.00720 - n02097047 miniature schnauzer, 0.00287 - n02100877 Irish setter, red setter, 0.00115 - - -Compare performance -~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - def check_performance(compiled_model, preprocessing_function=None): - num_images = 1000 - - start = time.perf_counter() - - for _ in range(num_images): - input_tensor = preprocessing_function(image_path, compiled_model) - compiled_model(input_tensor) - - end = time.perf_counter() - time_ir = end - start - - return time_ir, num_images - - - time_ir, num_images = check_performance(compiled_model, manual_image_preprocessing) - print(f"IR model in OpenVINO Runtime/CPU with manual image preprocessing: {time_ir/num_images:.4f} " f"seconds per image, FPS: {num_images/time_ir:.2f}") - - time_ir, num_images = check_performance(compiled_model_with_preprocess_api, prepare_image_api_preprocess) - print(f"IR model in OpenVINO Runtime/CPU with preprocessing API: {time_ir/num_images:.4f} " f"seconds per image, FPS: {num_images/time_ir:.2f}") - - -.. parsed-literal:: - - IR model in OpenVINO Runtime/CPU with manual image preprocessing: 0.0162 seconds per image, FPS: 61.85 - IR model in OpenVINO Runtime/CPU with preprocessing API: 0.0204 seconds per image, FPS: 48.97 - diff --git a/docs/notebooks/optimize-preprocessing-with-output_files/optimize-preprocessing-with-output_14_1.png b/docs/notebooks/optimize-preprocessing-with-output_files/optimize-preprocessing-with-output_14_1.png deleted file mode 100644 index 15c5ba574c1614..00000000000000 --- a/docs/notebooks/optimize-preprocessing-with-output_files/optimize-preprocessing-with-output_14_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7511b8a4e5b047600d5fed14fbc7e9653a868bc5253abf1e0c3ef649b47bc408 -size 387941 diff --git a/docs/notebooks/outetts-text-to-speech-with-output.rst b/docs/notebooks/outetts-text-to-speech-with-output.rst deleted file mode 100644 index eccb8ff961102c..00000000000000 --- a/docs/notebooks/outetts-text-to-speech-with-output.rst +++ /dev/null @@ -1,374 +0,0 @@ -Text-to-Speech synthesis using OuteTTS and OpenVINO -=================================================== - -.. warning:: - - Important note: This notebook requires python >= 3.10. Please make - sure that your environment fulfill to this requirement before running - it - -`OuteTTS-0.1-350M `__ is -a novel text-to-speech synthesis model that leverages pure language -modeling without external adapters or complex architectures, built upon -the LLaMa architecture. It demonstrates that high-quality speech -synthesis is achievable through a straightforward approach using crafted -prompts and audio tokens. - -More details about model can be found in `original -repo `__. - -In this tutorial we consider how to run OuteTTS pipeline using OpenVINO. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Convert model <#convert-model>`__ -- `Run model inference <#run-model-inference>`__ - - - `Text-to-Speech generation <#text-to-speech-generation>`__ - - `Text-to-Speech generation with Voice - Cloning <#text-to-speech-generation-with-voice-cloning>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. 
code:: ipython3 - - import platform - - %pip install -q "torch>=2.1" "torchaudio" "einops" "transformers>=4.46.1" "loguru" "inflect" "pesq" "torchcrepe" "natsort" "polars" uroman mecab-python3 unidic-lite --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "gradio>=4.19" "openvino>=2024.4.0" "tqdm" "pyyaml" "librosa" "soundfile" - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - -.. code:: ipython3 - - import requests - from pathlib import Path - - utility_files = ["cmd_helper.py", "notebook_utils.py"] - base_utility_url = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/" - - for utility_file in utility_files: - if not Path(utility_file).exists(): - r = requests.get(base_utility_url + utility_file) - with Path(utility_file).open("w") as f: - f.write(r.text) - - - helper_files = ["gradio_helper.py", "ov_outetts_helper.py"] - base_helper_url = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/outetts-text-to-speech" - - for helper_file in helper_files: - if not Path(helper_file).exists(): - r = requests.get(base_helper_url + helper_file) - with Path(helper_file).open("w") as f: - f.write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("outetts-text-to-speech.ipynb") - -.. code:: ipython3 - - from cmd_helper import clone_repo - - repo_path = clone_repo("https://github.com/edwko/OuteTTS.git") - - interface_path = repo_path / "outetts/version/v1/interface.py" - - updated_version = interface_path.exists() - - if not updated_version: - interface_pth = repo_path / "outetts/v0_1/interface.py" - orig_interface_path = interface_path.parent / "_orig_interface.py" - - if not updated_version and not orig_interface_path.exists(): - interface_path.rename(orig_interface_path) - # sounddevice requires to install manually additional libraries, as we do not plan to use it for audio playing - # move it closer to its usage for avoid errors - with orig_interface_path.open("r") as in_file: - content = in_file.read() - upd_content = content.replace("import sounddevice as sd", "") - upd_content = upd_content.replace("sd.play", "import sounddevice as sd\n sd.play") - with interface_path.open("w") as out_file: - out_file.write(upd_content) - - %pip install -q {repo_path} --extra-index-url https://download.pytorch.org/whl/cpu - -Convert model -------------- - - - -OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate -Representation format. For convenience, we will use OpenVINO integration -with HuggingFace Optimum. `Optimum -Intel `__ is the -interface between the Transformers and Diffusers libraries and the -different tools and libraries provided by Intel to accelerate end-to-end -pipelines on Intel architectures. - -Among other use cases, Optimum Intel provides a simple interface to -optimize your Transformers and Diffusers models, convert them to the -OpenVINO Intermediate Representation (IR) format and run inference using -OpenVINO Runtime. ``optimum-cli`` provides command line interface for -model conversion and optimization. - -General command format: - -.. 
code:: bash - - optimum-cli export openvino --model --task - -where task is task to export the model for, if not specified, the task -will be auto-inferred based on the model. You can find a mapping between -tasks and model classes in Optimum TaskManager -`documentation `__. -Additionally, you can specify weights compression using -``--weight-format`` argument with one of following options: ``fp32``, -``fp16``, ``int8`` and ``int4``. For int8 and int4 -`nncf `__ will be used for -weight compression. More details about model export provided in `Optimum -Intel -documentation `__. - -As OuteTTS utilizes pure language modeling approach, model conversion -process remains the same like conversion LLaMa models family for text -generation purposes. - -.. code:: ipython3 - - from cmd_helper import optimum_cli - - model_id = "OuteAI/OuteTTS-0.1-350M" - model_dir = Path(model_id.split("/")[-1] + "-ov") - - if not model_dir.exists(): - optimum_cli(model_id, model_dir, additional_args={"task": "text-generation-with-past"}) - -Run model inference -------------------- - - - -OpenVINO integration with Optimum Intel provides ready-to-use API for -model inference that can be used for smooth integration with -transformers-based solutions. For loading model, we will use -``OVModelForCausalLM`` class that have compatible interface with -Transformers LLaMa implementation. For loading a model, -``from_pretrained`` method should be used. It accepts path to the model -directory or model_id from HuggingFace hub (if model is not converted to -OpenVINO format, conversion will be triggered automatically). -Additionally, we can provide an inference device, quantization config -(if model has not been quantized yet) and device-specific OpenVINO -Runtime configuration. More details about model inference with Optimum -Intel can be found in -`documentation `__. -We will use ``OVModelForCausalLM`` as replacement of original -``AutoModelForCausalLM`` in ``InterfaceHF``. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - from ov_outetts_helper import InterfaceOV, OVHFModel # noqa: F401 - - # Uncomment these lines to see pipeline details - # ??InterfaceOV - # ??OVHFModel - - -.. parsed-literal:: - - 2024-11-29 11:48:51.975233: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-29 11:48:51.989550: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - WARNING: All log messages before absl::InitializeLog() is called are written to STDERR - E0000 00:00:1732866532.005718 2314480 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - E0000 00:00:1732866532.010517 2314480 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2024-11-29 11:48:52.027376: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. 
- To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - - -.. code:: ipython3 - - interface = InterfaceOV(model_dir, device.value) - - -.. parsed-literal:: - - making attention of type 'vanilla' with 768 in_channels - - -Text-to-Speech generation -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Now let’s see model in action. Providing input text to ``generate`` -method of interface, model returns tensor that represents output audio -with random speaker characteristics. - -.. code:: ipython3 - - output = interface.generate(text="Hello, I'm working!", temperature=0.1, repetition_penalty=1.1, max_length=4096) - - -.. parsed-literal:: - - The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results. - Setting `pad_token_id` to `eos_token_id`:None for open-end generation. - The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results. - - -.. code:: ipython3 - - import IPython.display as ipd - - ipd.Audio(output.audio[0].numpy(), rate=output.sr) - - - - -.. raw:: html - - - - - - - -Text-to-Speech generation with Voice Cloning -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Additionally, we can specify reference voice for generation by providing -reference audio and transcript for it. ``interface.create_speaker`` -processes reference audio and text to set of features used for audio -description. - -.. code:: ipython3 - - from notebook_utils import download_file - - ref_audio_url = "https://huggingface.co/OuteAI/OuteTTS-0.1-350M/resolve/main/samples/2.wav" - file_path = Path("2.wav") - - if not file_path.exists(): - file_path = download_file(ref_audio_url) - - -.. parsed-literal:: - - '2.wav' already exists. - - -.. code:: ipython3 - - ipd.Audio(file_path) - - - - -.. raw:: html - - - - - - - -.. code:: ipython3 - - speaker = interface.create_speaker(file_path, "Hello, I can speak pretty well, but sometimes I make some mistakes.") - - # Save the speaker to a file - interface.save_speaker(speaker, "speaker.pkl") - - # Load the speaker from a file - speaker = interface.load_speaker("speaker.pkl") - - # Generate TTS with the custom voice - output = interface.generate(text="This is a cloned voice speaking", speaker=speaker, temperature=0.1, repetition_penalty=1.1, max_length=4096) - - -.. parsed-literal:: - - The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results. - Setting `pad_token_id` to `eos_token_id`:None for open-end generation. - - -.. code:: ipython3 - - ipd.Audio(output.audio[0].numpy(), rate=output.sr) - - - - -.. raw:: html - - - - - - - -Interactive demo ----------------- - - - -.. 
code:: ipython3 - - from gradio_helper import make_demo - - demo = make_demo(interface) - - try: - demo.launch(debug=True) - except Exception: - demo.launch(share=True, debug=True) diff --git a/docs/notebooks/paddle-ocr-webcam-with-output.rst b/docs/notebooks/paddle-ocr-webcam-with-output.rst deleted file mode 100644 index cc7de18d078726..00000000000000 --- a/docs/notebooks/paddle-ocr-webcam-with-output.rst +++ /dev/null @@ -1,668 +0,0 @@ -PaddleOCR with OpenVINO™ -======================== - -This demo shows how to run PP-OCR model on OpenVINO natively. Instead of -exporting the PaddlePaddle model to ONNX and then converting to the -OpenVINO Intermediate Representation (OpenVINO IR) format with model -conversion API, you can now read directly from the PaddlePaddle Model -without any conversions. -`PaddleOCR `__ is an -ultra-light OCR model trained with PaddlePaddle deep learning framework, -that aims to create multilingual and practical OCR tools. - -The PaddleOCR pre-trained model used in the demo refers to the *“Chinese -and English ultra-lightweight PP-OCR model (9.4M)”*. More open source -pre-trained models can be downloaded at `PaddleOCR -GitHub `__ or `PaddleOCR -Gitee `__. Working pipeline of -the PaddleOCR is as follows: - - **NOTE**: To use this notebook with a webcam, you need to run the - notebook on a computer with a webcam. If you run the notebook on a - server, the webcam will not work. You can still do inference on a - video file. - - -**Table of contents:** - - -- `Imports <#imports>`__ - - - `Select inference device <#select-inference-device>`__ - - `Models for PaddleOCR <#models-for-paddleocr>`__ - - - `Download the Model for Text - Detection <#download-the-model-for-text-detection>`__ - - `Load the Model for Text - Detection <#load-the-model-for-text-detection>`__ - - `Download the Model for Text - Recognition <#download-the-model-for-text-recognition>`__ - - `Load the Model for Text Recognition with Dynamic - Shape <#load-the-model-for-text-recognition-with-dynamic-shape>`__ - - - `Preprocessing Image Functions for Text Detection and - Recognition <#preprocessing-image-functions-for-text-detection-and-recognition>`__ - - `Postprocessing Image for Text - Detection <#postprocessing-image-for-text-detection>`__ - - `Main Processing Function for - PaddleOCR <#main-processing-function-for-paddleocr>`__ - -- `Run Live PaddleOCR with - OpenVINO <#run-live-paddleocr-with-openvino>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - %pip install -q "openvino>=2023.1.0" - import platform - - if platform.system() == "Windows": - %pip install -q "paddlepaddle>=2.5.1,<2.6.0" - else: - %pip install -q "paddlepaddle>=2.5.1" - %pip install -q "pyclipper>=1.2.1" "shapely>=1.7.1" opencv-python tqdm "Pillow>=11.0" - -Imports -------- - - - -.. code:: ipython3 - - import cv2 - import numpy as np - import paddle - import math - import time - import collections - from PIL import Image - from pathlib import Path - import tarfile - - import openvino as ov - from IPython import display - import copy - -.. 
code:: ipython3 - - # Import local modules - - if not Path("./notebook_utils.py").exists(): - # Fetch `notebook_utils` module - import requests - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - import notebook_utils as utils - import pre_post_processing as processing - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("paddle-ocr-webcam.ipynb") - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = utils.device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO') - - - -Models for PaddleOCR -~~~~~~~~~~~~~~~~~~~~ - - - -PaddleOCR includes two parts of deep learning models, text detection and -text recognition. Pre-trained models used in the demo are downloaded and -stored in the “model” folder. - -Only a few lines of code are required to run the model. First, -initialize the runtime for inference. Then, read the network -architecture and model weights from the ``.pdmodel`` and ``.pdiparams`` -files to load to CPU/GPU. - -.. code:: ipython3 - - # Define the function to download text detection and recognition models from PaddleOCR resources. - - - def run_model_download(model_url: str, model_file_path: Path) -> None: - """ - Download pre-trained models from PaddleOCR resources - - Parameters: - model_url: url link to pre-trained models - model_file_path: file path to store the downloaded model - """ - archive_path = model_file_path.absolute().parent.parent / model_url.split("/")[-1] - if model_file_path.is_file(): - print("Model already exists") - else: - # Download the model from the server, and untar it. - print("Downloading the pre-trained model... May take a while...") - - # Create a directory. - utils.download_file(model_url, archive_path.name, archive_path.parent) - print("Model Downloaded") - - file = tarfile.open(archive_path) - res = file.extractall(archive_path.parent) - file.close() - if not res: - print(f"Model Extracted to {model_file_path}.") - else: - print("Error Extracting the model. Please check the network.") - -Download the Model for Text **Detection** -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -.. code:: ipython3 - - # A directory where the model will be downloaded. - - det_model_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/paddle-ocr/ch_PP-OCRv3_det_infer.tar" - det_model_file_path = Path("model/ch_PP-OCRv3_det_infer/inference.pdmodel") - - run_model_download(det_model_url, det_model_file_path) - - -.. parsed-literal:: - - Downloading the pre-trained model... May take a while... - - - -.. parsed-literal:: - - /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/paddle-ocr-webcam/model/ch_PP-OCRv3_de… - - -.. parsed-literal:: - - Model Downloaded - Model Extracted to model/ch_PP-OCRv3_det_infer/inference.pdmodel. - - -Load the Model for Text **Detection** -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -.. code:: ipython3 - - # Initialize OpenVINO Runtime for text detection. 
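-     # The PaddlePaddle .pdmodel/.pdiparams files are read directly; no conversion to ONNX or OpenVINO IR is required.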
- core = ov.Core() - det_model = core.read_model(model=det_model_file_path) - det_compiled_model = core.compile_model(model=det_model, device_name=device.value) - - # Get input and output nodes for text detection. - det_input_layer = det_compiled_model.input(0) - det_output_layer = det_compiled_model.output(0) - -Download the Model for Text **Recognition** -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -.. code:: ipython3 - - rec_model_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/paddle-ocr/ch_PP-OCRv3_rec_infer.tar" - rec_model_file_path = Path("model/ch_PP-OCRv3_rec_infer/inference.pdmodel") - - run_model_download(rec_model_url, rec_model_file_path) - - -.. parsed-literal:: - - Downloading the pre-trained model... May take a while... - - - -.. parsed-literal:: - - /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/paddle-ocr-webcam/model/ch_PP-OCRv3_re… - - -.. parsed-literal:: - - Model Downloaded - Model Extracted to model/ch_PP-OCRv3_rec_infer/inference.pdmodel. - - -Load the Model for Text **Recognition** with Dynamic Shape -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -Input to text recognition model refers to detected bounding boxes with -different image sizes, for example, dynamic input shapes. Hence: - -1. Input dimension with dynamic input shapes needs to be specified - before loading text recognition model. -2. Dynamic shape is specified by assigning -1 to the input dimension or - by setting the upper bound of the input dimension using, for example, - ``Dimension(1, 512)``. - -.. code:: ipython3 - - # Read the model and corresponding weights from a file. - rec_model = core.read_model(model=rec_model_file_path) - - # Assign dynamic shapes to every input layer on the last dimension. - for input_layer in rec_model.inputs: - input_shape = input_layer.partial_shape - input_shape[3] = -1 - rec_model.reshape({input_layer: input_shape}) - - rec_compiled_model = core.compile_model(model=rec_model, device_name="AUTO") - - # Get input and output nodes. - rec_input_layer = rec_compiled_model.input(0) - rec_output_layer = rec_compiled_model.output(0) - -Preprocessing Image Functions for Text Detection and Recognition -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Define preprocessing functions for text detection and recognition: 1. -Preprocessing for text detection: resize and normalize input images. 2. -Preprocessing for text recognition: resize and normalize detected box -images to the same size (for example, ``(3, 32, 320)`` size for images -with Chinese text) for easy batching in inference. - -.. code:: ipython3 - - # Preprocess for text detection. - def image_preprocess(input_image, size): - """ - Preprocess input image for text detection - - Parameters: - input_image: input image - size: value for the image to be resized for text detection model - """ - img = cv2.resize(input_image, (size, size)) - img = np.transpose(img, [2, 0, 1]) / 255 - img = np.expand_dims(img, 0) - # NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} - img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) - img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) - img -= img_mean - img /= img_std - return img.astype(np.float32) - -.. code:: ipython3 - - # Preprocess for text recognition. 
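-    # resize_norm_img() below rescales a detected text crop to a fixed height,
-    # normalizes pixel values to the [-1, 1] range, and right-pads the crop to a
-    # common width so that several crops can be batched for recognition inference.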
- def resize_norm_img(img, max_wh_ratio): - """ - Resize input image for text recognition - - Parameters: - img: bounding box image from text detection - max_wh_ratio: value for the resizing for text recognition model - """ - rec_image_shape = [3, 48, 320] - imgC, imgH, imgW = rec_image_shape - assert imgC == img.shape[2] - character_type = "ch" - if character_type == "ch": - imgW = int((32 * max_wh_ratio)) - h, w = img.shape[:2] - ratio = w / float(h) - if math.ceil(imgH * ratio) > imgW: - resized_w = imgW - else: - resized_w = int(math.ceil(imgH * ratio)) - resized_image = cv2.resize(img, (resized_w, imgH)) - resized_image = resized_image.astype("float32") - resized_image = resized_image.transpose((2, 0, 1)) / 255 - resized_image -= 0.5 - resized_image /= 0.5 - padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) - padding_im[:, :, 0:resized_w] = resized_image - return padding_im - - - def prep_for_rec(dt_boxes, frame): - """ - Preprocessing of the detected bounding boxes for text recognition - - Parameters: - dt_boxes: detected bounding boxes from text detection - frame: original input frame - """ - ori_im = frame.copy() - img_crop_list = [] - for bno in range(len(dt_boxes)): - tmp_box = copy.deepcopy(dt_boxes[bno]) - img_crop = processing.get_rotate_crop_image(ori_im, tmp_box) - img_crop_list.append(img_crop) - - img_num = len(img_crop_list) - # Calculate the aspect ratio of all text bars. - width_list = [] - for img in img_crop_list: - width_list.append(img.shape[1] / float(img.shape[0])) - - # Sorting can speed up the recognition process. - indices = np.argsort(np.array(width_list)) - return img_crop_list, img_num, indices - - - def batch_text_box(img_crop_list, img_num, indices, beg_img_no, batch_num): - """ - Batch for text recognition - - Parameters: - img_crop_list: processed detected bounding box images - img_num: number of bounding boxes from text detection - indices: sorting for bounding boxes to speed up text recognition - beg_img_no: the beginning number of bounding boxes for each batch of text recognition inference - batch_num: number of images for each batch - """ - norm_img_batch = [] - max_wh_ratio = 0 - end_img_no = min(img_num, beg_img_no + batch_num) - for ino in range(beg_img_no, end_img_no): - h, w = img_crop_list[indices[ino]].shape[0:2] - wh_ratio = w * 1.0 / h - max_wh_ratio = max(max_wh_ratio, wh_ratio) - for ino in range(beg_img_no, end_img_no): - norm_img = resize_norm_img(img_crop_list[indices[ino]], max_wh_ratio) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - - norm_img_batch = np.concatenate(norm_img_batch) - norm_img_batch = norm_img_batch.copy() - return norm_img_batch - -Postprocessing Image for Text Detection -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - def post_processing_detection(frame, det_results): - """ - Postprocess the results from text detection into bounding boxes - - Parameters: - frame: input image - det_results: inference results from text detection model - """ - ori_im = frame.copy() - data = {"image": frame} - data_resize = processing.DetResizeForTest(data) - data_list = [] - keep_keys = ["image", "shape"] - for key in keep_keys: - data_list.append(data_resize[key]) - img, shape_list = data_list - - shape_list = np.expand_dims(shape_list, axis=0) - pred = det_results[0] - if isinstance(pred, paddle.Tensor): - pred = pred.numpy() - segmentation = pred > 0.3 - - boxes_batch = [] - for batch_index in range(pred.shape[0]): - src_h, src_w, ratio_h, ratio_w = shape_list[batch_index] - mask = segmentation[batch_index] - boxes, scores = processing.boxes_from_bitmap(pred[batch_index], mask, src_w, src_h) - boxes_batch.append({"points": boxes}) - post_result = boxes_batch - dt_boxes = post_result[0]["points"] - dt_boxes = processing.filter_tag_det_res(dt_boxes, ori_im.shape) - return dt_boxes - -Main Processing Function for PaddleOCR -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Run ``paddleOCR`` function in different operations, either a webcam or a -video file. See the list of procedures below: - -1. Create a video player to play with target fps - (``utils.VideoPlayer``). -2. Prepare a set of frames for text detection and recognition. -3. Run AI inference for both text detection and recognition. -4. Visualize the results. - -.. code:: ipython3 - - # Download font and a character dictionary for printing OCR results. - font_path = Path("fonts/simfang.ttf") - character_dictionary_path = Path("fonts/ppocr_keys_v1.txt") - - if not font_path.exists(): - utils.download_file( - url="https://raw.githubusercontent.com/Halfish/lstm-ctc-ocr/master/fonts/simfang.ttf", - directory="fonts", - ) - if not character_dictionary_path.exists(): - utils.download_file( - url="https://raw.githubusercontent.com/WenmuZhou/PytorchOCR/master/torchocr/datasets/alphabets/ppocr_keys_v1.txt", - directory="fonts", - ) - - - -.. parsed-literal:: - - fonts/simfang.ttf: 0%| | 0.00/10.1M [00:00 200: - processing_times.popleft() - processing_time_det = np.mean(processing_times) * 1000 - - # Preprocess detection results for recognition. - dt_boxes = processing.sorted_boxes(dt_boxes) - batch_num = 6 - img_crop_list, img_num, indices = prep_for_rec(dt_boxes, frame) - - # For storing recognition results, include two parts: - # txts are the recognized text results, scores are the recognition confidence level. - rec_res = [["", 0.0]] * img_num - txts = [] - scores = [] - - for beg_img_no in range(0, img_num, batch_num): - # Recognition starts from here. - norm_img_batch = batch_text_box(img_crop_list, img_num, indices, beg_img_no, batch_num) - - # Run inference for text recognition. - rec_results = rec_compiled_model([norm_img_batch])[rec_output_layer] - - # Postprocessing recognition results. - postprocess_op = processing.build_post_process(processing.postprocess_params) - rec_result = postprocess_op(rec_results) - for rno in range(len(rec_result)): - rec_res[indices[beg_img_no + rno]] = rec_result[rno] - if rec_res: - txts = [rec_res[i][0] for i in range(len(rec_res))] - scores = [rec_res[i][1] for i in range(len(rec_res))] - - image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) - boxes = dt_boxes - # Draw text recognition results beside the image. 
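-            # draw_ocr_box_txt() overlays the detected boxes on the frame and lists the
-            # recognized strings beside it; results with confidence below 0.5 are dropped.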
- draw_img = processing.draw_ocr_box_txt(image, boxes, txts, scores, drop_score=0.5, font_path=str(font_path)) - - # Visualize the PaddleOCR results. - f_height, f_width = draw_img.shape[:2] - fps = 1000 / processing_time_det - cv2.putText( - img=draw_img, - text=f"Inference time: {processing_time_det:.1f}ms ({fps:.1f} FPS)", - org=(20, 40), - fontFace=cv2.FONT_HERSHEY_COMPLEX, - fontScale=f_width / 1000, - color=(0, 0, 255), - thickness=1, - lineType=cv2.LINE_AA, - ) - - # Use this workaround if there is flickering. - if use_popup: - draw_img = cv2.cvtColor(draw_img, cv2.COLOR_RGB2BGR) - cv2.imshow(winname=title, mat=draw_img) - key = cv2.waitKey(1) - # escape = 27 - if key == 27: - break - else: - # Encode numpy array to jpg. - draw_img = cv2.cvtColor(draw_img, cv2.COLOR_RGB2BGR) - _, encoded_img = cv2.imencode(ext=".jpg", img=draw_img, params=[cv2.IMWRITE_JPEG_QUALITY, 100]) - # Create an IPython image. - i = display.Image(data=encoded_img) - # Display the image in this notebook. - display.clear_output(wait=True) - display.display(i) - - # ctrl-c - except KeyboardInterrupt: - print("Interrupted") - # any different error - except RuntimeError as e: - print(e) - finally: - if player is not None: - # Stop capturing. - player.stop() - if use_popup: - cv2.destroyAllWindows() - -Run Live PaddleOCR with OpenVINO --------------------------------- - - - -Use a webcam as the video input. By default, the primary webcam is set -with ``source=0``. If you have multiple webcams, each one will be -assigned a consecutive number starting at 0. Set ``flip=True`` when -using a front-facing camera. Some web browsers, especially Mozilla -Firefox, may cause flickering. If you experience flickering, set -``use_popup=True``. - - **NOTE**: Popup mode may not work if you run this notebook on a - remote computer. - -If you do not have a webcam, you can still run this demo with a video -file. Any `format supported by -OpenCV `__ -will work. - -Run live PaddleOCR: - -.. code:: ipython3 - - USE_WEBCAM = False - - cam_id = 0 - video_file = Path("test.mp4") - - if not USE_WEBCAM and not video_file.exists(): - utils.download_file("https://raw.githubusercontent.com/yoyowz/classification/master/images/test.mp4") - - source = cam_id if USE_WEBCAM else video_file - - run_paddle_ocr(source, flip=False, use_popup=False) diff --git a/docs/notebooks/paddle-to-openvino-classification-with-output.rst b/docs/notebooks/paddle-to-openvino-classification-with-output.rst deleted file mode 100644 index 18ba44d182d334..00000000000000 --- a/docs/notebooks/paddle-to-openvino-classification-with-output.rst +++ /dev/null @@ -1,505 +0,0 @@ -Convert a PaddlePaddle Model to OpenVINO™ IR -============================================ - -This notebook shows how to convert a MobileNetV3 model from -`PaddleHub `__, pre-trained -on the `ImageNet `__ dataset, to OpenVINO IR. -It also shows how to perform classification inference on a sample image, -using `OpenVINO -Runtime `__ -and compares the results of the -`PaddlePaddle `__ model with the -IR model. - -Source of the -`model `__. 
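-
-At its core, the conversion is a single ``ov.convert_model`` call on the exported
-``inference.pdmodel`` file. A minimal sketch is shown below (the path assumes the
-model archive has already been downloaded and extracted under ``model/``, as done
-later in this notebook):
-
-.. code:: ipython3
-
-    import openvino as ov
-
-    # Sketch only: convert the extracted PaddlePaddle inference model to OpenVINO IR
-    ov_model = ov.convert_model("model/MobileNetV3_large_x1_0_infer/inference.pdmodel")
-    ov.save_model(ov_model, "MobileNetV3_large_x1_0.xml")  # serialize the IR to disk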
- - -**Table of contents:** - - -- `Preparation <#preparation>`__ - - - `Imports <#imports>`__ - - `Settings <#settings>`__ - -- `Show Inference on PaddlePaddle - Model <#show-inference-on-paddlepaddle-model>`__ -- `Convert the Model to OpenVINO IR - Format <#convert-the-model-to-openvino-ir-format>`__ -- `Select inference device <#select-inference-device>`__ -- `Show Inference on OpenVINO - Model <#show-inference-on-openvino-model>`__ -- `Timing and Comparison <#timing-and-comparison>`__ -- `Select inference device <#select-inference-device>`__ -- `References <#references>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Preparation ------------ - - - -Imports -~~~~~~~ - - - -.. code:: ipython3 - - %pip install -q "paddlepaddle>=2.5.1,<2.6.0" - %pip install -q "paddleclas>=2.5.2" --no-deps - %pip install -q "prettytable" "ujson" "visualdl>=2.5.3" "faiss-cpu>=1.7.1" Pillow tqdm "matplotlib>=3.4" "opencv-python" "scikit-learn" - # Install openvino package - %pip install -q "openvino>=2023.1.0" - -.. code:: ipython3 - - import time - import tarfile - from pathlib import Path - - import matplotlib.pyplot as plt - import numpy as np - import openvino as ov - from paddleclas import PaddleClas - from PIL import Image - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("paddle-to-openvino-classification.ipynb") - -Settings -~~~~~~~~ - - - -Set ``IMAGE_FILENAME`` to the filename of an image to use. Set -``MODEL_NAME`` to the PaddlePaddle model to download from PaddleHub. -``MODEL_NAME`` will also be the base name for the IR model. The notebook -is tested with the -`MobileNetV3_large_x1_0 `__ -model. Other models may use different preprocessing methods and -therefore require some modification to get the same results on the -original and converted model. - -First of all, we need to download and unpack model files. The first time -you run this notebook, the PaddlePaddle model is downloaded from -PaddleHub. This may take a while. - -.. 
code:: ipython3 - - # Download the image from the openvino_notebooks storage - img = Path("data/coco_close.png") - - if not img.exists(): - download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_close.png", - directory="data", - ) - - IMAGE_FILENAME = img.as_posix() - - MODEL_NAME = "MobileNetV3_large_x1_0" - MODEL_DIR = Path("model") - if not MODEL_DIR.exists(): - MODEL_DIR.mkdir() - - if not (MODEL_DIR / "{}_infer".format(MODEL_NAME)).exists(): - MODEL_URL = "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/{}_infer.tar".format(MODEL_NAME) - download_file(MODEL_URL, directory=MODEL_DIR) - file = tarfile.open(MODEL_DIR / "{}_infer.tar".format(MODEL_NAME)) - res = file.extractall(MODEL_DIR) - if not res: - print(f'Model Extracted to "./{MODEL_DIR}".') - else: - print("Error Extracting the model. Please check the network.") - - -.. parsed-literal:: - - 'data/coco_close.png' already exists. - 'model/MobileNetV3_large_x1_0_infer.tar' already exists. - Model Extracted to "./model". - - -Show Inference on PaddlePaddle Model ------------------------------------- - - - -In the next cell, we load the model, load and display an image, do -inference on that image, and then show the top three prediction results. - -.. code:: ipython3 - - classifier = PaddleClas(inference_model_dir=MODEL_DIR / "{}_infer".format(MODEL_NAME)) - result = next(classifier.predict(IMAGE_FILENAME)) - class_names = result[0]["label_names"] - scores = result[0]["scores"] - image = Image.open(IMAGE_FILENAME) - plt.imshow(image) - for class_name, softmax_probability in zip(class_names, scores): - print(f"{class_name}, {softmax_probability:.5f}") - - -.. parsed-literal:: - - [2024/11/14 13:49:26] ppcls WARNING: The current running environment does not support the use of GPU. CPU has been used instead. - Labrador retriever, 0.75138 - German short-haired pointer, 0.02373 - Great Dane, 0.01848 - Rottweiler, 0.01435 - flat-coated retriever, 0.01144 - - - -.. image:: paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_7_1.png - - -``classifier.predict()`` takes an image file name, reads the image, -preprocesses the input, then returns the class labels and scores of the -image. Preprocessing the image is done behind the scenes. The -classification model returns an array with floating point values for -each of the 1000 ImageNet classes. The higher the value, the more -confident the network is that the class number corresponding to that -value (the index of that value in the network output array) is the class -number for the image. - -To see PaddlePaddle’s implementation for the classification function and -for loading and preprocessing data, uncomment the next two cells. - -.. code:: ipython3 - - # classifier?? - -.. code:: ipython3 - - # classifier.get_config() - -The ``classifier.get_config()`` module shows the preprocessing -configuration for the model. It should show that images are normalized, -resized and cropped, and that the BGR image is converted to RGB before -propagating it through the network. In the next cell, we get the -``classifier.predictror.preprocess_ops`` property that returns list of -preprocessing operations to do inference on the OpenVINO IR model using -the same method. - -.. 
code:: ipython3 - - preprocess_ops = classifier.predictor.preprocess_ops - - - def process_image(image): - for op in preprocess_ops: - image = op(image) - return image - -It is useful to show the output of the ``process_image()`` function, to -see the effect of cropping and resizing. Because of the normalization, -the colors will look strange, and ``matplotlib`` will warn about -clipping values. - -.. code:: ipython3 - - pil_image = Image.open(IMAGE_FILENAME) - processed_image = process_image(np.array(pil_image)) - print(f"Processed image shape: {processed_image.shape}") - # Processed image is in (C,H,W) format, convert to (H,W,C) to show the image - plt.imshow(np.transpose(processed_image, (1, 2, 0))) - - -.. parsed-literal:: - - [2024-11-14 13:49:27,391] [ WARNING] image.py:705 - Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Got range [-2.117904..2.640001]. - - -.. parsed-literal:: - - Processed image shape: (3, 224, 224) - - - - -.. parsed-literal:: - - - - - - -.. image:: paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_14_3.png - - -To decode the labels predicted by the model to names of classes, we need -to have a mapping between them. The model config contains information -about ``class_id_map_file``, which stores such mapping. The code below -shows how to parse the mapping into a dictionary to use with the -OpenVINO model. - -.. code:: ipython3 - - class_id_map_file = classifier.get_config()["PostProcess"]["Topk"]["class_id_map_file"] - class_id_map = {} - with open(class_id_map_file, "r") as fin: - lines = fin.readlines() - for line in lines: - partition = line.split("\n")[0].partition(" ") - class_id_map[int(partition[0])] = str(partition[-1]) - -Convert the Model to OpenVINO IR Format ---------------------------------------- - - - -Call the OpenVINO Model Conversion API to convert the PaddlePaddle model -to OpenVINO IR, with FP32 precision. ``ov.convert_model`` function -accept path to PaddlePaddle model and returns OpenVINO Model class -instance which represents this model. Obtained model is ready to use and -loading on device using ``ov.compile_model`` or can be saved on disk -using ``ov.save_model`` function. See the `Model Conversion -Guide `__ -for more information about the Model Conversion API. - -.. code:: ipython3 - - model_xml = Path(MODEL_NAME).with_suffix(".xml") - if not model_xml.exists(): - ov_model = ov.convert_model("model/MobileNetV3_large_x1_0_infer/inference.pdmodel") - ov.save_model(ov_model, str(model_xml)) - else: - print(f"{model_xml} already exists.") - -Select inference device ------------------------ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - core = ov.Core() - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Show Inference on OpenVINO Model --------------------------------- - - - -Load the IR model, get model information, load the image, do inference, -convert the inference to a meaningful result, and show the output. See -the `OpenVINO Runtime API -Notebook `__ for more information. - -.. 
code:: ipython3 - - # Load OpenVINO Runtime and OpenVINO IR model - core = ov.Core() - model = core.read_model(model_xml) - compiled_model = core.compile_model(model=model, device_name=device.value) - - # Get model output - output_layer = compiled_model.output(0) - - # Read, show, and preprocess input image - # See the "Show Inference on PaddlePaddle Model" section for source of process_image - image = Image.open(IMAGE_FILENAME) - plt.imshow(image) - input_image = process_image(np.array(image))[None,] - - # Do inference - ov_result = compiled_model([input_image])[output_layer][0] - - # find the top three values - top_indices = np.argsort(ov_result)[-3:][::-1] - top_scores = ov_result[top_indices] - - # Convert the inference results to class names, using the same labels as the PaddlePaddle classifier - for index, softmax_probability in zip(top_indices, top_scores): - print(f"{class_id_map[index]}, {softmax_probability:.5f}") - - -.. parsed-literal:: - - Labrador retriever, 0.74909 - German short-haired pointer, 0.02368 - Great Dane, 0.01873 - - - -.. image:: paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_22_1.png - - -Timing and Comparison ---------------------- - - - -Measure the time it takes to do inference on fifty images and compare -the result. The timing information gives an indication of performance. -For a fair comparison, we include the time it takes to process the -image. For more accurate benchmarking, use the `OpenVINO benchmark -tool `__. -Note that many optimizations are possible to improve the performance. - -.. code:: ipython3 - - num_images = 50 - - image = Image.open(fp=IMAGE_FILENAME) - -.. code:: ipython3 - - import openvino.properties as props - - - # Show device information - core = ov.Core() - devices = core.available_devices - - for device_name in devices: - device_full_name = core.get_property(device_name, props.device.full_name) - print(f"{device_name}: {device_full_name}") - - -.. parsed-literal:: - - CPU: Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz - - -.. code:: ipython3 - - # Show inference speed on PaddlePaddle model - start = time.perf_counter() - for _ in range(num_images): - result = next(classifier.predict(np.array(image))) - end = time.perf_counter() - time_ir = end - start - print(f"PaddlePaddle model on CPU: {time_ir/num_images:.4f} " f"seconds per image, FPS: {num_images/time_ir:.2f}\n") - print("PaddlePaddle result:") - class_names = result[0]["label_names"] - scores = result[0]["scores"] - for class_name, softmax_probability in zip(class_names, scores): - print(f"{class_name}, {softmax_probability:.5f}") - plt.imshow(image); - - -.. parsed-literal:: - - PaddlePaddle model on CPU: 0.0073 seconds per image, FPS: 137.05 - - PaddlePaddle result: - Labrador retriever, 0.75138 - German short-haired pointer, 0.02373 - Great Dane, 0.01848 - Rottweiler, 0.01435 - flat-coated retriever, 0.01144 - - - -.. image:: paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_26_1.png - - -Select inference device ------------------------ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. 
code:: ipython3 - - # Show inference speed on OpenVINO IR model - compiled_model = core.compile_model(model=model, device_name=device.value) - output_layer = compiled_model.output(0) - - - start = time.perf_counter() - input_image = process_image(np.array(image))[None,] - for _ in range(num_images): - ie_result = compiled_model([input_image])[output_layer][0] - top_indices = np.argsort(ie_result)[-5:][::-1] - top_softmax = ie_result[top_indices] - - end = time.perf_counter() - time_ir = end - start - - print(f"OpenVINO IR model in OpenVINO Runtime ({device.value}): {time_ir/num_images:.4f} " f"seconds per image, FPS: {num_images/time_ir:.2f}") - print() - print("OpenVINO result:") - for index, softmax_probability in zip(top_indices, top_softmax): - print(f"{class_id_map[index]}, {softmax_probability:.5f}") - plt.imshow(image); - - -.. parsed-literal:: - - OpenVINO IR model in OpenVINO Runtime (AUTO): 0.0028 seconds per image, FPS: 359.33 - - OpenVINO result: - Labrador retriever, 0.74909 - German short-haired pointer, 0.02368 - Great Dane, 0.01873 - Rottweiler, 0.01448 - flat-coated retriever, 0.01153 - - - -.. image:: paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_29_1.png - - -References ----------- - - - -- `PaddleClas `__ -- `OpenVINO PaddlePaddle - support `__ diff --git a/docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_14_3.png b/docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_14_3.png deleted file mode 100644 index 4975d9659a52f4..00000000000000 --- a/docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_14_3.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b27c41670fb077db0316b011253fc81b16643bb6ad6745b2ea6ec7ffda48ebd6 -size 120883 diff --git a/docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_22_1.png b/docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_22_1.png deleted file mode 100644 index ece8d3c1cc496a..00000000000000 --- a/docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_22_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06968d7e7a935e0cc267cf7d0cfeb2fe419ad73ec78df2c366c4be4a854c2b30 -size 224886 diff --git a/docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_26_1.png b/docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_26_1.png deleted file mode 100644 index ece8d3c1cc496a..00000000000000 --- a/docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_26_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06968d7e7a935e0cc267cf7d0cfeb2fe419ad73ec78df2c366c4be4a854c2b30 -size 224886 diff --git a/docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_29_1.png b/docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_29_1.png deleted file mode 100644 index ece8d3c1cc496a..00000000000000 --- 
a/docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_29_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06968d7e7a935e0cc267cf7d0cfeb2fe419ad73ec78df2c366c4be4a854c2b30 -size 224886 diff --git a/docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_7_1.png b/docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_7_1.png deleted file mode 100644 index ece8d3c1cc496a..00000000000000 --- a/docs/notebooks/paddle-to-openvino-classification-with-output_files/paddle-to-openvino-classification-with-output_7_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06968d7e7a935e0cc267cf7d0cfeb2fe419ad73ec78df2c366c4be4a854c2b30 -size 224886 diff --git a/docs/notebooks/parler-tts-text-to-speech-with-output.rst b/docs/notebooks/parler-tts-text-to-speech-with-output.rst deleted file mode 100644 index b1cb65487e5562..00000000000000 --- a/docs/notebooks/parler-tts-text-to-speech-with-output.rst +++ /dev/null @@ -1,424 +0,0 @@ -Text-to-speech (TTS) with Parler-TTS and OpenVINO -================================================= - -Parler-TTS is a lightweight text-to-speech (TTS) model that can generate -high-quality, natural sounding speech in the style of a given speaker -(gender, pitch, speaking style, etc). It is a reproduction of work from -the paper `Natural language guidance of high-fidelity text-to-speech -with synthetic -annotations `__ by Dan Lyth -and Simon King, from Stability AI and Edinburgh University respectively. - -.. image:: https://images.squarespace-cdn.com/content/v1/657816dfbefe0533e8a69d9a/30c96e25-acc5-4019-acdd-648da6142c4c/architecture_v3.png?format=2500w - -Text-to-speech models trained on large-scale datasets have demonstrated -impressive in-context learning capabilities and naturalness. However, -control of speaker identity and style in these models typically requires -conditioning on reference speech recordings, limiting creative -applications. Alternatively, natural language prompting of speaker -identity and style has demonstrated promising results and provides an -intuitive method of control. However, reliance on human-labeled -descriptions prevents scaling to large datasets. - -This work bridges the gap between these two approaches. The authors -propose a scalable method for labeling various aspects of speaker -identity, style, and recording conditions. This method then is applied -to a 45k hour dataset, which is used to train a speech language model. -Furthermore, the authors propose simple methods for increasing audio -fidelity, significantly outperforming recent work despite relying -entirely on found data. - -`GitHub repository `__ - -`HuggingFace page `__ - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Load the original model and - inference <#load-the-original-model-and-inference>`__ -- `Convert the model to OpenVINO - IR <#convert-the-model-to-openvino-ir>`__ -- `Compiling models and inference <#compiling-models-and-inference>`__ -- `Interactive inference <#interactive-inference>`__ - - - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. 
code:: ipython3 - - import os - - os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - - %pip uninstall -q -y torch torchvision torchaudio - %pip install -q "openvino>=2024.2.0" - %pip install -q git+https://github.com/huggingface/parler-tts.git "gradio>=4.19" transformers "torch>=2.2" "torchaudio" --extra-index-url https://download.pytorch.org/whl/cpu - -Load the original model and inference -------------------------------------- - - - -.. code:: ipython3 - - import torch - from parler_tts import ParlerTTSForConditionalGeneration - from transformers import AutoTokenizer - import soundfile as sf - - device = "cpu" - - repo_id = "parler-tts/parler_tts_mini_v0.1" - model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device) - tokenizer = AutoTokenizer.from_pretrained(repo_id) - - prompt = "Hey, how are you doing today?" - description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast." - - input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device) - prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device) - - generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids) - audio_arr = generation.cpu().numpy().squeeze() - sf.write("parler_tts_out.wav", audio_arr, model.config.sampling_rate) - -.. code:: ipython3 - - import IPython.display as ipd - - ipd.Audio("parler_tts_out.wav") - - - - -.. raw:: html - - - - - - - -Convert the model to OpenVINO IR --------------------------------- - - - -Let’s define the conversion function for PyTorch modules. We use -``ov.convert_model`` function to obtain OpenVINO Intermediate -Representation object and ``ov.save_model`` function to save it as XML -file. - -.. code:: ipython3 - - import openvino as ov - from pathlib import Path - - - def convert(model: torch.nn.Module, xml_path: str, example_input): - xml_path = Path(xml_path) - if not xml_path.exists(): - xml_path.parent.mkdir(parents=True, exist_ok=True) - with torch.no_grad(): - converted_model = ov.convert_model(model, example_input=example_input) - - ov.save_model(converted_model, xml_path) - - # cleanup memory - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - -In the pipeline two models are used: Text Encoder (``T5EncoderModel``) -and Decoder (``ParlerTTSDecoder``). Lets convert them one by one. - -.. code:: ipython3 - - TEXT_ENCODER_OV_PATH = Path("models/text_encoder_ir.xml") - - - example_input = { - "input_ids": torch.ones((1, 39), dtype=torch.int64), - } - - text_encoder_ov_model = convert(model.text_encoder, TEXT_ENCODER_OV_PATH, example_input) - -The Decoder Model performs in generation pipeline and we can separate it -into two stage. In the first stage the model generates -``past_key_values`` into output for the second stage. In the second -stage the model produces tokens during several runs. - -.. 
code:: ipython3 - - DECODER_STAGE_1_OV_PATH = Path("models/decoder_stage_1_ir.xml") - - - class DecoderStage1Wrapper(torch.nn.Module): - def __init__(self, decoder): - super().__init__() - self.decoder = decoder - - def forward(self, input_ids=None, encoder_hidden_states=None, encoder_attention_mask=None, prompt_hidden_states=None): - return self.decoder( - input_ids=input_ids, - return_dict=False, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - prompt_hidden_states=prompt_hidden_states, - ) - - - example_input = { - "input_ids": torch.ones((9, 1), dtype=torch.int64), - "encoder_hidden_states": torch.ones((1, 39, 1024), dtype=torch.float32), - "encoder_attention_mask": torch.ones((1, 39), dtype=torch.int64), - "prompt_hidden_states": torch.ones((1, 9, 1024), dtype=torch.float32), - } - - decoder_1_ov_model = convert(DecoderStage1Wrapper(model.decoder.model.decoder), DECODER_STAGE_1_OV_PATH, example_input) - -.. code:: ipython3 - - DECODER_STAGE_2_OV_PATH = Path("models/decoder_stage_2_ir.xml") - - - class DecoderStage2Wrapper(torch.nn.Module): - def __init__(self, decoder): - super().__init__() - self.decoder = decoder - - def forward(self, input_ids=None, encoder_hidden_states=None, encoder_attention_mask=None, past_key_values=None): - past_key_values = tuple(tuple(past_key_values[i : i + 4]) for i in range(0, len(past_key_values), 4)) - return self.decoder( - input_ids=input_ids, - return_dict=False, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - ) - - - example_input = { - "input_ids": torch.ones((9, 1), dtype=torch.int64), - "encoder_hidden_states": torch.ones((1, 39, 1024), dtype=torch.float32), - "encoder_attention_mask": torch.ones((1, 39), dtype=torch.int64), - "past_key_values": ( - ( - torch.ones(1, 16, 10, 64, dtype=torch.float32), - torch.ones(1, 16, 10, 64, dtype=torch.float32), - torch.ones(1, 16, 39, 64, dtype=torch.float32), - torch.ones(1, 16, 39, 64, dtype=torch.float32), - ) - * 24 - ), - } - - decoder_2_ov_model = convert(DecoderStage2Wrapper(model.decoder.model.decoder), DECODER_STAGE_2_OV_PATH, example_input) - -Compiling models and inference ------------------------------- - - - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import device_widget - - device = device_widget() - - device - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("parler-tts-text-to-speech.ipynb") - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=4, options=('CPU', 'GPU.0', 'GPU.1', 'GPU.2', 'AUTO'), value='AUTO') - - - -Let’s create callable wrapper classes for compiled models to allow -interaction with original pipeline. Note that all of wrapper classes -return ``torch.Tensor``\ s instead of ``np.array``\ s. In the -``DecoderWrapper`` we separates the pipeline into two stages. - -.. 
code:: ipython3 - - from collections import namedtuple - - import torch.nn as nn - - EncoderOutput = namedtuple("EncoderOutput", "last_hidden_state") - DecoderOutput = namedtuple("DecoderOutput", ("last_hidden_state", "past_key_values", "hidden_states", "attentions", "cross_attentions")) - - core = ov.Core() - - - class TextEncoderModelWrapper(torch.nn.Module): - def __init__(self, encoder_ir_path, config): - ov_config = {} - if "GPU" in device.value: - ov_config = {"INFERENCE_PRECISION_HINT": "f32"} - self.encoder = core.compile_model(encoder_ir_path, device.value, ov_config) - self.config = config - self.dtype = self.config.torch_dtype - - def __call__(self, input_ids, **_): - last_hidden_state = self.encoder(input_ids)[0] - return EncoderOutput(torch.from_numpy(last_hidden_state)) - - - class DecoderWrapper(torch.nn.Module): - def __init__(self, decoder_stage_1_ir_path, decoder_stage_2_ir_path, config): - super().__init__() - self.decoder_stage_1 = core.compile_model(decoder_stage_1_ir_path, device.value) - self.decoder_stage_2 = core.compile_model(decoder_stage_2_ir_path, device.value) - self.config = config - self.embed_tokens = None - embed_dim = config.vocab_size + 1 # + 1 for pad token id - self.embed_tokens = nn.ModuleList([nn.Embedding(embed_dim, config.hidden_size) for _ in range(config.num_codebooks)]) - - def __call__(self, input_ids=None, encoder_hidden_states=None, encoder_attention_mask=None, past_key_values=None, prompt_hidden_states=None, **kwargs): - inputs = {} - if input_ids is not None: - inputs["input_ids"] = input_ids - if encoder_hidden_states is not None: - inputs["encoder_hidden_states"] = encoder_hidden_states - if encoder_attention_mask is not None: - inputs["encoder_attention_mask"] = encoder_attention_mask - if prompt_hidden_states is not None: - inputs["prompt_hidden_states"] = prompt_hidden_states - if past_key_values is not None: - past_key_values = tuple(past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer) - inputs["past_key_values"] = past_key_values - arguments = ( - input_ids, - encoder_hidden_states, - encoder_attention_mask, - *past_key_values, - ) - outs = self.decoder_stage_2(arguments) - else: - outs = self.decoder_stage_1(inputs) - - outs = [torch.from_numpy(out) for out in outs.values()] - past_key_values = list(list(outs[i : i + 4]) for i in range(1, len(outs), 4)) - - return DecoderOutput(outs[0], past_key_values, None, None, None) - -Now we can replace the original models by our wrapped OpenVINO models -and run inference. - -.. code:: ipython3 - - model.text_encoder = TextEncoderModelWrapper(TEXT_ENCODER_OV_PATH, model.text_encoder.config) - model.decoder.model.decoder = DecoderWrapper(DECODER_STAGE_1_OV_PATH, DECODER_STAGE_2_OV_PATH, model.decoder.model.decoder.config) - model._supports_cache_class = False - model._supports_static_cache = False - -.. code:: ipython3 - - generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids) - audio_arr = generation.cpu().numpy().squeeze() - sf.write("parler_tts_out.wav", audio_arr, model.config.sampling_rate) - -.. code:: ipython3 - - import IPython.display as ipd - - ipd.Audio("parler_tts_out.wav") - - - - -.. raw:: html - - - - - - - -Interactive inference ---------------------- - - - -.. 
code:: ipython3 - - from transformers import AutoFeatureExtractor, set_seed - - - feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id) - SAMPLE_RATE = feature_extractor.sampling_rate - - - def infer(prompt, description, seed): - set_seed(seed) - - input_ids = tokenizer(description, return_tensors="pt").input_ids.to("cpu") - prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cpu") - - generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids) - audio_arr = generation.cpu().numpy().squeeze() - sr = SAMPLE_RATE - - return sr, audio_arr - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/parler-tts-text-to-speech/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=infer) - - try: - demo.queue().launch(debug=True) - except Exception: - demo.queue().launch(share=True, debug=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - -.. code:: ipython3 - - # please uncomment and run this cell for stopping gradio interface - # demo.close() diff --git a/docs/notebooks/person-counting-with-output.rst b/docs/notebooks/person-counting-with-output.rst deleted file mode 100644 index c9e4a67618dd9e..00000000000000 --- a/docs/notebooks/person-counting-with-output.rst +++ /dev/null @@ -1,277 +0,0 @@ -Person Counting System using YOLOV8 and OpenVINO™ -================================================= - -In this project, we utilized YOLOV8 Object Counting class to develop a -real-time people counting system using the YOLOv8 object detection -model, optimized for Intel’s OpenVINO toolkit to enhance inferencing -speed. This system effectively monitors the number of individuals -entering and exiting a room, leveraging the optimized YOLOv8 model for -accurate people detection under varied conditions. - -By utilizing the OpenVINO runtime on Intel hardware, the system achieves -significant improvements in processing speed, making it ideal for -applications requiring real-time data, such as occupancy management and -traffic flow control in public spaces and commercial settings. - - **NOTE**: To use this notebook with a webcam, you need to run the - notebook on a computer with a webcam. If you run the notebook on a - server, the webcam will not work. However, you can still do inference - on a video. - - -**Table of contents:** - - -- `Install pre-requisites <#install-pre-requisites>`__ -- `Download Model <#download-model>`__ -- `Inference function <#inference-function>`__ -- `Run live demo <#run-live-demo>`__ -- `Select inference device <#select-inference-device>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Install pre-requisites ----------------------- - - - -.. 
code:: ipython3
-
-    %pip uninstall -q -y "opencv-python-headless" "opencv-python"
-    %pip install -q "ultralytics==8.3.59" "torch>=2.1" --extra-index-url https://download.pytorch.org/whl/cpu
-    %pip install -q "openvino>=2024.0.0" "opencv-python" "lap>=0.4" "shapely"
-
-Download Model
---------------
-
-Download and convert YOLOv8 to the OpenVINO Intermediate Representation.
-
-
-
-.. code:: ipython3
-
-    from pathlib import Path
-    from ultralytics import YOLO
-
-    import requests
-
-    if not Path("notebook_utils.py").exists():
-        r = requests.get(
-            url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py",
-        )
-        open("notebook_utils.py", "w").write(r.text)
-
-    # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry
-    from notebook_utils import collect_telemetry
-
-    collect_telemetry("person-counting.ipynb")
-
-    models_dir = Path("./models")
-    models_dir.mkdir(exist_ok=True)
-
-    DET_MODEL_NAME = "yolov8n"
-
-    det_model = YOLO(models_dir / f"{DET_MODEL_NAME}.pt")
-    label_map = det_model.model.names
-
-    # An empty call is needed to initialize the model
-    res = det_model()
-    det_model_path = models_dir / f"{DET_MODEL_NAME}_openvino_model/{DET_MODEL_NAME}.xml"
-    if not det_model_path.exists():
-        det_model.export(format="openvino", dynamic=True, half=True)
-
-
-.. parsed-literal::
-
-    WARNING ⚠️ 'source' is missing. Using 'source=/home/ea/work/notebooks_env/lib/python3.8/site-packages/ultralytics/assets'.
-
-    image 1/2 /home/ea/work/notebooks_env/lib/python3.8/site-packages/ultralytics/assets/bus.jpg: 640x480 4 persons, 1 bus, 1 stop sign, 110.2ms
-    image 2/2 /home/ea/work/notebooks_env/lib/python3.8/site-packages/ultralytics/assets/zidane.jpg: 384x640 2 persons, 1 tie, 83.6ms
-    Speed: 2.4ms preprocess, 96.9ms inference, 248.0ms postprocess per image at shape (1, 3, 384, 640)
-
-
-Inference function
-------------------
-
-
-
-..
code:: ipython3 - - from ultralytics import YOLO - from ultralytics.solutions import ObjectCounter - import cv2 - import time - import collections - import numpy as np - from IPython import display - import torch - import openvino as ov - - - def run_inference(source, device): - core = ov.Core() - - det_ov_model = core.read_model(det_model_path) - ov_config = {} - - if device.value != "CPU": - det_ov_model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device.value or ("AUTO" in device.value and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - compiled_model = core.compile_model(det_ov_model, device.value, ov_config) - - def infer(*args): - result = compiled_model(args) - return torch.from_numpy(result[0]) - - # Use openVINO as inference engine - det_model.predictor.inference = infer - det_model.predictor.model.pt = False - - try: - cap = cv2.VideoCapture(source) - assert cap.isOpened(), "Error reading video file" - - line_points = [(0, 300), (1080, 300)] # line or region points - - # Init Object Counter - counter = ObjectCounter(show=False, region=line_points, model=det_model_path.parent, line_width=2, show_in=False, show_out=False) - compiled_model.track = counter.model.track - counter.model = compiled_model - # Processing time - processing_times = collections.deque(maxlen=200) - - while cap.isOpened(): - success, frame = cap.read() - if not success: - print("Video frame is empty or video processing has been successfully completed.") - break - - start_time = time.time() - frame = counter.count(frame) - stop_time = time.time() - - processing_times.append(stop_time - start_time) - - # Mean processing time [ms]. - _, f_width = frame.shape[:2] - processing_time = np.mean(processing_times) * 1000 - fps = 1000 / processing_time - cv2.putText( - img=frame, - text=f"Inference time: {processing_time:.1f}ms ({fps:.1f} FPS)", - org=(20, 40), - fontFace=cv2.FONT_HERSHEY_COMPLEX, - fontScale=f_width / 1000, - color=(0, 0, 255), - thickness=2, - lineType=cv2.LINE_AA, - ) - - # Get the counts. Counts are getting as 'OUT' - # Modify this logic accordingly - counts = counter.out_count - - # Define the text to display - text = f"Count: {counts}" - fontFace = cv2.FONT_HERSHEY_COMPLEX - fontScale = 0.75 # Adjust scale as needed - thickness = 2 - - # Calculate the size of the text box - (text_width, text_height), _ = cv2.getTextSize(text, fontFace, fontScale, thickness) - - # Define the upper right corner for the text - top_right_corner = (frame.shape[1] - text_width - 20, 40) - # Draw the count of "OUT" on the frame - cv2.putText( - img=frame, - text=text, - org=(top_right_corner[0], top_right_corner[1]), - fontFace=fontFace, - fontScale=fontScale, - color=(0, 0, 255), - thickness=thickness, - lineType=cv2.LINE_AA, - ) - - # Show the frame - _, encoded_img = cv2.imencode(ext=".jpg", img=frame, params=[cv2.IMWRITE_JPEG_QUALITY, 100]) - # Create an IPython image. - i = display.Image(data=encoded_img) - # Display the image in this notebook. - display.clear_output(wait=True) - display.display(i) - except KeyboardInterrupt: - print("Interrupted") - - cap.release() - cv2.destroyAllWindows() - -Run live demo -------------- - - - -.. 
code:: ipython3 - - from notebook_utils import download_file - - WEBCAM_INFERENCE = False - - if WEBCAM_INFERENCE: - VIDEO_SOURCE = 0 # Webcam - else: - VIDEO_SOURCE = Path("people-detection.mp4") - if not VIDEO_SOURCE.exists(): - download_file("https://storage.openvinotoolkit.org/data/test_data/videos/people-detection.mp4") - - **NOTE**: make sure to restart kernel and run all cells when - switching between video and webcam to avoid any errors. - -Select inference device ------------------------ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - run_inference( - source=VIDEO_SOURCE, - device=device, - ) - - - -.. image:: person-counting-with-output_files/person-counting-with-output_14_0.png - - -.. parsed-literal:: - - Video frame is empty or video processing has been successfully completed. - diff --git a/docs/notebooks/person-counting-with-output_files/person-counting-with-output_14_0.png b/docs/notebooks/person-counting-with-output_files/person-counting-with-output_14_0.png deleted file mode 100644 index 8c185cb866bd73..00000000000000 --- a/docs/notebooks/person-counting-with-output_files/person-counting-with-output_14_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eaae17e97804b1e553b79a0b7a352ff59a02d7391f40a1b3fe6f021a84a537be -size 155816 diff --git a/docs/notebooks/person-tracking-with-output.rst b/docs/notebooks/person-tracking-with-output.rst deleted file mode 100644 index 1065593eacdbcf..00000000000000 --- a/docs/notebooks/person-tracking-with-output.rst +++ /dev/null @@ -1,687 +0,0 @@ -Person Tracking with OpenVINO™ -============================== - -This notebook demonstrates live person tracking with OpenVINO: it reads -frames from an input video sequence, detects people in the frames, -uniquely identifies each one of them and tracks all of them until they -leave the frame. We will use the `Deep -SORT `__ algorithm to perform object -tracking, an extension to SORT (Simple Online and Realtime Tracking). - -Detection vs Tracking ---------------------- - -- In object detection, we detect an object in a frame, put a bounding - box or a mask around it, and classify the object. Note that, the job - of the detector ends here. It processes each frame independently and - identifies numerous objects in that particular frame. -- An object tracker on the other hand needs to track a particular - object across the entire video. If the detector detects three cars in - the frame, the object tracker has to identify the three separate - detections and needs to track it across the subsequent frames (with - the help of a unique ID). - -Deep SORT ---------- - -`Deep SORT `__ can be defined as the -tracking algorithm which tracks objects not only based on the velocity -and motion of the object but also the appearance of the object. It is -made of three key components which are as follows: |deepsort| - -1. **Detection** - - This is the first step in the tracking module. In this step, a deep - learning model will be used to detect the objects in the frame that - are to be tracked. These detections are then passed on to the next - step. - -2. **Prediction** - - In this step, we use Kalman filter [1] framework to predict a target - bounding box of each tracking object in the next frame. 
There are two - states of prediction output: ``confirmed`` and ``unconfirmed``. A new - track comes with a state of ``unconfirmed`` by default, and it can be - turned into ``confirmed`` when a certain number of consecutive - detections are matched with this new track. Meanwhile, if a matched - track is missed over a specific time, it will be deleted as well. - -3. **Data association and update** - - Now, we have to match the target bounding box with the detected - bounding box, and update track identities. A conventional way to - solve the association between the predicted Kalman states and newly - arrived measurements is to build an assignment problem with the - Hungarian algorithm [2]. In this problem formulation, we integrate - motion and appearance information through a combination of two - appropriate metrics. The cost used for the first matching step is set - as a combination of the Mahalanobis and the cosine distances. The - `Mahalanobis - distance `__ is - used to incorporate motion information and the cosine distance is - used to calculate similarity between two objects. Cosine distance is - a metric that helps the tracker recover identities in case of - long-term occlusion and motion estimation also fails. For this - purposes, a reidentification model will be implemented to produce a - vector in high-dimensional space that represents the appearance of - the object. Using these simple things can make the tracker even more - powerful and accurate. - - In the second matching stage, we will run intersection over - union(IOU) association as proposed in the original SORT algorithm [3] - on the set of unconfirmed and unmatched tracks from the previous - step. If the IOU of detection and target is less than a certain - threshold value called ``IOUmin`` then that assignment is rejected. - This helps to account for sudden appearance changes, for example, due - to partial occlusion with static scene geometry, and to increase - robustness against erroneous. - - When detection result is associated with a target, the detected - bounding box is used to update the target state. - --------------- - -[1] R. Kalman, “A New Approach to Linear Filtering and Prediction -Problems”, Journal of Basic Engineering, vol. 82, no. Series D, -pp. 35-45, 1960. - -[2] H. W. Kuhn, “The Hungarian method for the assignment problem”, Naval -Research Logistics Quarterly, vol. 2, pp. 83-97, 1955. - -[3] A. Bewley, G. Zongyuan, F. Ramos, and B. Upcroft, “Simple online and -realtime tracking,” in ICIP, 2016, pp. 3464–3468. - -.. |deepsort| image:: https://user-images.githubusercontent.com/91237924/221744683-0042eff8-2c41-43b8-b3ad-b5929bafb60b.png - - -**Table of contents:** - - -- `Imports <#imports>`__ -- `Download the Model <#download-the-model>`__ -- `Load model <#load-model>`__ - - - `Select inference device <#select-inference-device>`__ - -- `Data Processing <#data-processing>`__ -- `Test person reidentification - model <#test-person-reidentification-model>`__ - - - `Visualize data <#visualize-data>`__ - - `Compare two persons <#compare-two-persons>`__ - -- `Main Processing Function <#main-processing-function>`__ -- `Run <#run>`__ - - - `Initialize tracker <#initialize-tracker>`__ - - `Run Live Person Tracking <#run-live-person-tracking>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. 
For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - %pip install -q "openvino>=2024.0.0" - %pip install -q opencv-python requests scipy tqdm "matplotlib>=3.4" - -Imports -------- - - - -.. code:: ipython3 - - import collections - from pathlib import Path - import time - - import numpy as np - import cv2 - from IPython import display - import matplotlib.pyplot as plt - import openvino as ov - -.. code:: ipython3 - - # Import local modules - import requests - - if not Path("./notebook_utils.py").exists(): - # Fetch `notebook_utils` module - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - import notebook_utils as utils - from deepsort_utils.tracker import Tracker - from deepsort_utils.nn_matching import NearestNeighborDistanceMetric - from deepsort_utils.detection import ( - Detection, - compute_color_for_labels, - xywh_to_xyxy, - xywh_to_tlwh, - tlwh_to_xyxy, - ) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("person-tracking.ipynb") - -Download the Model ------------------- - - - -We will use pre-trained models from OpenVINO’s `Open Model -Zoo `__ -to start the test. - -In this case, `person detection -model `__ -is deployed to detect the person in each frame of the video, and -`reidentification -model `__ -is used to output embedding vector to match a pair of images of a person -by the cosine distance. - -.. code:: ipython3 - - from notebook_utils import download_ir_model - - # A directory where the model will be downloaded. - base_model_dir = "model" - precision = "FP16" - # The name of the model from Open Model Zoo - detection_model_name = "person-detection-0202" - download_det_model_url = ( - f"https://storage.openvinotoolkit.org/repositories/open_model_zoo/2023.0/models_bin/1/{detection_model_name}/{precision}/{detection_model_name}.xml" - ) - detection_model_path = Path(base_model_dir) / detection_model_name / precision / f"{detection_model_name}.xml" - - if not detection_model_path.exists(): - - download_ir_model(download_det_model_url, Path(base_model_dir) / detection_model_name / precision) - - reidentification_model_name = "person-reidentification-retail-0287" - download_reid_model_url = f"https://storage.openvinotoolkit.org/repositories/open_model_zoo/2023.0/models_bin/1/{reidentification_model_name}/{precision}/{reidentification_model_name}.xml" - reidentification_model_path = Path(base_model_dir) / reidentification_model_name / precision / f"{reidentification_model_name}.xml" - - if not reidentification_model_path.exists(): - download_ir_model(download_reid_model_url, Path(base_model_dir) / reidentification_model_name / precision) - -Load model ----------- - - - -Define a common class for model loading and predicting. - -There are four main steps for OpenVINO model initialization, and they -are required to run for only once before inference loop. 1. Initialize -OpenVINO Runtime. 2. Read the network from ``*.bin`` and ``*.xml`` files -(weights and architecture). 3. Compile the model for device. 4. Get -input and output names of nodes. - -In this case, we can put them all in a class constructor function. - -To let OpenVINO automatically select the best device for inference just -use ``AUTO``. 
In most cases, the best device to use is ``GPU`` (better -performance, but slightly longer startup time). - -.. code:: ipython3 - - core = ov.Core() - - - class Model: - """ - This class represents a OpenVINO model object. - - """ - - def __init__(self, model_path, batchsize=1, device="AUTO"): - """ - Initialize the model object - - Parameters - ---------- - model_path: path of inference model - batchsize: batch size of input data - device: device used to run inference - """ - self.model = core.read_model(model=model_path) - self.input_layer = self.model.input(0) - self.input_shape = self.input_layer.shape - self.height = self.input_shape[2] - self.width = self.input_shape[3] - - for layer in self.model.inputs: - input_shape = layer.partial_shape - input_shape[0] = batchsize - self.model.reshape({layer: input_shape}) - self.compiled_model = core.compile_model(model=self.model, device_name=device) - self.output_layer = self.compiled_model.output(0) - - def predict(self, input): - """ - Run inference - - Parameters - ---------- - input: array of input data - """ - result = self.compiled_model(input)[self.output_layer] - return result - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = utils.device_widget() - - device - -.. code:: ipython3 - - detector = Model(detection_model_path, device=device.value) - # since the number of detection object is uncertain, the input batch size of reid model should be dynamic - extractor = Model(reidentification_model_path, -1, device.value) - -Data Processing ---------------- - - - -Data Processing includes data preprocess and postprocess functions. - -Data preprocess function is used to change the layout and shape of input -data, according to requirement of the network input format. - Data -postprocess function is used to extract the useful information from -network’s original output and visualize it. - -.. code:: ipython3 - - def preprocess(frame, height, width): - """ - Preprocess a single image - - Parameters - ---------- - frame: input frame - height: height of model input data - width: width of model input data - """ - resized_image = cv2.resize(frame, (width, height)) - resized_image = resized_image.transpose((2, 0, 1)) - input_image = np.expand_dims(resized_image, axis=0).astype(np.float32) - return input_image - - - def batch_preprocess(img_crops, height, width): - """ - Preprocess batched images - - Parameters - ---------- - img_crops: batched input images - height: height of model input data - width: width of model input data - """ - img_batch = np.concatenate([preprocess(img, height, width) for img in img_crops], axis=0) - return img_batch - - - def process_results(h, w, results, thresh=0.5): - """ - postprocess detection results - - Parameters - ---------- - h, w: original height and width of input image - results: raw detection network output - thresh: threshold for low confidence filtering - """ - # The 'results' variable is a [1, 1, N, 7] tensor. - detections = results.reshape(-1, 7) - boxes = [] - labels = [] - scores = [] - for i, detection in enumerate(detections): - _, label, score, xmin, ymin, xmax, ymax = detection - # Filter detected objects. - if score > thresh: - # Create a box with pixels coordinates from the box with normalized coordinates [0,1]. 
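-                # The values appended below are (center_x, center_y, width, height) in pixels,
-                # which is the xywh layout expected by the Deep SORT helpers (xywh_to_xyxy, xywh_to_tlwh).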
- boxes.append( - [ - (xmin + xmax) / 2 * w, - (ymin + ymax) / 2 * h, - (xmax - xmin) * w, - (ymax - ymin) * h, - ] - ) - labels.append(int(label)) - scores.append(float(score)) - - if len(boxes) == 0: - boxes = np.array([]).reshape(0, 4) - scores = np.array([]) - labels = np.array([]) - return np.array(boxes), np.array(scores), np.array(labels) - - - def draw_boxes(img, bbox, identities=None): - """ - Draw bounding box in original image - - Parameters - ---------- - img: original image - bbox: coordinate of bounding box - identities: identities IDs - """ - for i, box in enumerate(bbox): - x1, y1, x2, y2 = [int(i) for i in box] - # box text and bar - id = int(identities[i]) if identities is not None else 0 - color = compute_color_for_labels(id) - label = "{}{:d}".format("", id) - t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 2, 2)[0] - cv2.rectangle(img, (x1, y1), (x2, y2), color, 2) - cv2.rectangle(img, (x1, y1), (x1 + t_size[0] + 3, y1 + t_size[1] + 4), color, -1) - cv2.putText( - img, - label, - (x1, y1 + t_size[1] + 4), - cv2.FONT_HERSHEY_PLAIN, - 1.6, - [255, 255, 255], - 2, - ) - return img - - - def cosin_metric(x1, x2): - """ - Calculate the consin distance of two vector - - Parameters - ---------- - x1, x2: input vectors - """ - return np.dot(x1, x2) / (np.linalg.norm(x1) * np.linalg.norm(x2)) - -Test person reidentification model ----------------------------------- - - - -The reidentification network outputs a blob with the ``(1, 256)`` shape -named ``reid_embedding``, which can be compared with other descriptors -using the cosine distance. - -Visualize data -~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - base_file_link = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/person_" - image_indices = ["1_1.png", "1_2.png", "2_1.png"] - image_paths = [ - utils.download_file(base_file_link + image_index, directory="data") for image_index in image_indices if not (Path("data") / image_index).exists() - ] - image1, image2, image3 = [cv2.cvtColor(cv2.imread(str(image_path)), cv2.COLOR_BGR2RGB) for image_path in image_paths] - - # Define titles with images. - data = {"Person 1": image1, "Person 2": image2, "Person 3": image3} - - # Create a subplot to visualize images. - fig, axs = plt.subplots(1, len(data.items()), figsize=(5, 5)) - - # Fill the subplot. - for ax, (name, image) in zip(axs, data.items()): - ax.axis("off") - ax.set_title(name) - ax.imshow(image) - - # Display an image. - plt.show(fig) - -Compare two persons -~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Metric parameters - MAX_COSINE_DISTANCE = 0.6 # threshold of matching object - input_data = [image2, image3] - img_batch = batch_preprocess(input_data, extractor.height, extractor.width) - features = extractor.predict(img_batch) - sim = cosin_metric(features[0], features[1]) - if sim >= 1 - MAX_COSINE_DISTANCE: - print(f"Same person (confidence: {sim})") - else: - print(f"Different person (confidence: {sim})") - -Main Processing Function ------------------------- - - - -Run person tracking on the specified source. Either a webcam feed or a -video file. - -.. code:: ipython3 - - # Main processing function to run person tracking. - def run_person_tracking(source=0, flip=False, use_popup=False, skip_first_frames=0): - """ - Main function to run the person tracking: - 1. Create a video player to play with target fps (utils.VideoPlayer). - 2. Prepare a set of frames for person tracking. - 3. Run AI inference for person tracking. - 4. Visualize the results. 
- - Parameters: - ---------- - source: The webcam number to feed the video stream with primary webcam set to "0", or the video path. - flip: To be used by VideoPlayer function for flipping capture image. - use_popup: False for showing encoded frames over this notebook, True for creating a popup window. - skip_first_frames: Number of frames to skip at the beginning of the video. - """ - player = None - try: - # Create a video player to play with target fps. - player = utils.VideoPlayer( - source=source, - size=(700, 450), - flip=flip, - fps=24, - skip_first_frames=skip_first_frames, - ) - # Start capturing. - player.start() - if use_popup: - title = "Press ESC to Exit" - cv2.namedWindow(winname=title, flags=cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE) - - processing_times = collections.deque() - while True: - # Grab the frame. - frame = player.next() - if frame is None: - print("Source ended") - break - # If the frame is larger than full HD, reduce size to improve the performance. - - # Resize the image and change dims to fit neural network input. - h, w = frame.shape[:2] - input_image = preprocess(frame, detector.height, detector.width) - - # Measure processing time. - start_time = time.time() - # Get the results. - output = detector.predict(input_image) - stop_time = time.time() - processing_times.append(stop_time - start_time) - if len(processing_times) > 200: - processing_times.popleft() - - _, f_width = frame.shape[:2] - # Mean processing time [ms]. - processing_time = np.mean(processing_times) * 1100 - fps = 1000 / processing_time - - # Get poses from detection results. - bbox_xywh, score, label = process_results(h, w, results=output) - - img_crops = [] - for box in bbox_xywh: - x1, y1, x2, y2 = xywh_to_xyxy(box, h, w) - img = frame[y1:y2, x1:x2] - img_crops.append(img) - - # Get reidentification feature of each person. - if img_crops: - # preprocess - img_batch = batch_preprocess(img_crops, extractor.height, extractor.width) - features = extractor.predict(img_batch) - else: - features = np.array([]) - - # Wrap the detection and reidentification results together - bbox_tlwh = xywh_to_tlwh(bbox_xywh) - detections = [Detection(bbox_tlwh[i], features[i]) for i in range(features.shape[0])] - - # predict the position of tracking target - tracker.predict() - - # update tracker - tracker.update(detections) - - # update bbox identities - outputs = [] - for track in tracker.tracks: - if not track.is_confirmed() or track.time_since_update > 1: - continue - box = track.to_tlwh() - x1, y1, x2, y2 = tlwh_to_xyxy(box, h, w) - track_id = track.track_id - outputs.append(np.array([x1, y1, x2, y2, track_id], dtype=np.int32)) - if len(outputs) > 0: - outputs = np.stack(outputs, axis=0) - - # draw box for visualization - if len(outputs) > 0: - bbox_tlwh = [] - bbox_xyxy = outputs[:, :4] - identities = outputs[:, -1] - frame = draw_boxes(frame, bbox_xyxy, identities) - - cv2.putText( - img=frame, - text=f"Inference time: {processing_time:.1f}ms ({fps:.1f} FPS)", - org=(20, 40), - fontFace=cv2.FONT_HERSHEY_COMPLEX, - fontScale=f_width / 1000, - color=(0, 0, 255), - thickness=1, - lineType=cv2.LINE_AA, - ) - - if use_popup: - cv2.imshow(winname=title, mat=frame) - key = cv2.waitKey(1) - # escape = 27 - if key == 27: - break - else: - # Encode numpy array to jpg. - _, encoded_img = cv2.imencode(ext=".jpg", img=frame, params=[cv2.IMWRITE_JPEG_QUALITY, 100]) - # Create an IPython image. - i = display.Image(data=encoded_img) - # Display the image in this notebook. 
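-                    # clear_output(wait=True) replaces the previously shown frame in place,
-                    # so the notebook cell does not grow with every processed frame.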
- display.clear_output(wait=True) - display.display(i) - - # ctrl-c - except KeyboardInterrupt: - print("Interrupted") - # any different error - except RuntimeError as e: - print(e) - finally: - if player is not None: - # Stop capturing. - player.stop() - if use_popup: - cv2.destroyAllWindows() - -Run ---- - - - -Initialize tracker -~~~~~~~~~~~~~~~~~~ - - - -Before running a new tracking task, we have to reinitialize a Tracker -object - -.. code:: ipython3 - - NN_BUDGET = 100 - MAX_COSINE_DISTANCE = 0.6 # threshold of matching object - metric = NearestNeighborDistanceMetric("cosine", MAX_COSINE_DISTANCE, NN_BUDGET) - tracker = Tracker(metric, max_iou_distance=0.7, max_age=70, n_init=3) - -Run Live Person Tracking -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Use a webcam as the video input. By default, the primary webcam is set -with ``source=0``. If you have multiple webcams, each one will be -assigned a consecutive number starting at 0. Set ``flip=True`` when -using a front-facing camera. Some web browsers, especially Mozilla -Firefox, may cause flickering. If you experience flickering, set -``use_popup=True``. - -If you do not have a webcam, you can still run this demo with a video -file. Any `format supported by -OpenCV `__ -will work. - -.. code:: ipython3 - - USE_WEBCAM = False - - cam_id = 0 - video_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/people.mp4" - video_file = Path("people.mp4") - source = cam_id if USE_WEBCAM else video_file - - if not USE_WEBCAM and not video_file.exists(): - utils.download_file(video_url) - - run_person_tracking(source=source, flip=USE_WEBCAM, use_popup=False) diff --git a/docs/notebooks/phi-3-vision-with-output.rst b/docs/notebooks/phi-3-vision-with-output.rst deleted file mode 100644 index 772a7216bc16f2..00000000000000 --- a/docs/notebooks/phi-3-vision-with-output.rst +++ /dev/null @@ -1,456 +0,0 @@ -Visual-language assistant with Phi3-Vision and OpenVINO -------------------------------------------------------- - -The Phi-3-Vision is a lightweight, state-of-the-art open multimodal -model built upon datasets which include - synthetic data and filtered -publicly available websites - with a focus on very high-quality, -reasoning dense data both on text and vision. The model belongs to the -Phi-3 model family, and the multimodal version comes with 128K context -length (in tokens) it can support. The model underwent a rigorous -enhancement process, incorporating both supervised fine-tuning and -direct preference optimization to ensure precise instruction adherence -and robust safety measures. More details about model can be found in -`model blog -post `__, -`technical report `__, -`Phi-3-cookbook `__ - -In this tutorial we consider how to use Phi-3-Vision model to build -multimodal chatbot using `Optimum -Intel `__. Additionally, -we optimize model to low precision using -`NNCF `__ - -**Table of contents:** - -- `Prerequisites <#prerequisites>`__ -- `Select Model <#select-model>`__ -- `Convert and Optimize model <#convert-and-optimize-model>`__ - - - `Compress model weights to - 4-bit <#compress-model-weights-to-4-bit>`__ - -- `Select inference device <#select-inference-device>`__ -- `Run OpenVINO model <#run-openvino-model>`__ -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. 
For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -install required packages and setup helper functions. - -.. code:: ipython3 - - import platform - - %pip install -q -U "torch>=2.1" "torchvision" "transformers>=4.45" "protobuf>=3.20" "gradio>=4.26" "Pillow" "accelerate" "tqdm" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install --pre -qU "openvino>=2024.6.0" "openvino-tokenizers>=2024.6.0" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - %pip install -q -U "nncf>=2.14.0" - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0" - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("cmd_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py") - open("cmd_helper.py", "w").write(r.text) - - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/phi-3-vision/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("phi-3-vision.ipynb") - -Select Model ------------- - - - -The tutorial supports the following models from Phi-3 model family: - -- `Phi-3.5-vision-instruct `__ -- `Phi-3-vision-128k-instruct `__ - -You can select one from the provided options below. - -.. code:: ipython3 - - import ipywidgets as widgets - - # Select model - model_ids = [ - "microsoft/Phi-3.5-vision-instruct", - "microsoft/Phi-3-vision-128k-instruct", - ] - - model_dropdown = widgets.Dropdown( - options=model_ids, - value=model_ids[0], - description="Model:", - disabled=False, - ) - - model_dropdown - - - - -.. parsed-literal:: - - Dropdown(description='Model:', options=('microsoft/Phi-3.5-vision-instruct', 'microsoft/Phi-3-vision-128k-inst… - - - -.. code:: ipython3 - - model_id = model_dropdown.value - print(f"Selected {model_id}") - MODEL_DIR = Path(model_id.split("/")[-1]) - - -.. parsed-literal:: - - Selected microsoft/Phi-3.5-vision-instruct - - -Convert and Optimize model --------------------------- - - - -Phi-3-vision is PyTorch model. OpenVINO supports PyTorch models via -conversion to OpenVINO Intermediate Representation (IR). `OpenVINO model -conversion -API `__ -should be used for these purposes. ``ov.convert_model`` function accepts -original PyTorch model instance and example input for tracing and -returns ``ov.Model`` representing this model in OpenVINO framework. -Converted model can be used for saving on disk using ``ov.save_model`` -function or directly loading on device using ``core.compile_model``. - -For convenience, we will use OpenVINO integration with HuggingFace -Optimum. `Optimum -Intel `__ is the -interface between the Transformers and Diffusers libraries and the -different tools and libraries provided by Intel to accelerate end-to-end -pipelines on Intel architectures. 
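-
-For reference, the direct conversion path mentioned above looks roughly as
-follows (a minimal sketch that is not used in this tutorial; ``pt_model`` and
-``example_input`` are placeholders, not objects defined in this notebook):
-
-.. code:: ipython3
-
-    import openvino as ov
-
-    # Trace the PyTorch model with an example input and get an ov.Model.
-    ov_model = ov.convert_model(pt_model, example_input=example_input)
-
-    # Save the converted model to disk as OpenVINO IR (.xml + .bin) ...
-    ov.save_model(ov_model, "model.xml")
-
-    # ... or compile it directly for a target device.
-    compiled_model = ov.Core().compile_model(ov_model, "AUTO")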
- -Among other use cases, Optimum Intel provides a simple interface to -optimize your Transformers and Diffusers models, convert them to the -OpenVINO Intermediate Representation (IR) format and run inference using -OpenVINO Runtime. ``optimum-cli`` provides command line interface for -model conversion and optimization. - -General command format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where task is task to export the model for, if not specified, the task -will be auto-inferred based on the model. You can find a mapping between -tasks and model classes in Optimum TaskManager -`documentation `__. -Additionally, you can specify weights compression using -``--weight-format`` argument with one of following options: ``fp32``, -``fp16``, ``int8`` and ``int4``. For int8 and int4 -`nncf `__ will be used for -weight compression. More details about model export provided in `Optimum -Intel -documentation `__. - -Compress model weights to 4-bit -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -For reducing memory -consumption, weights compression optimization can be applied using -`NNCF `__ during run Optimum -Intel CLI. - -.. raw:: html - -
- -.. raw:: html - - - -Click here for more details about weight compression - -.. raw:: html - - - -Weight compression aims to reduce the memory footprint of a model. It -can also lead to significant performance improvement for large -memory-bound models, such as Large Language Models (LLMs). LLMs and -other models, which require extensive memory to store the weights during -inference, can benefit from weight compression in the following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method primarily -designed to optimize LLMs. The main difference between weights -compression and full model quantization (post-training quantization) is -that activations remain floating-point in the case of weights -compression which leads to a better accuracy. Weight compression for -LLMs provides a solid inference performance improvement which is on par -with the performance of the full model quantization. In addition, weight -compression is data-free and does not require a calibration dataset, -making it easy to use. - -``nncf.compress_weights`` function can be used for performing weights -compression. The function accepts an OpenVINO model and other -compression parameters. Compared to INT8 compression, INT4 compression -improves performance even more, but introduces a minor drop in -prediction quality. - -More details about weights compression, can be found in `OpenVINO -documentation `__. - -.. raw:: html - -
- -.. code:: ipython3 - - to_compress = widgets.Checkbox(value=True, description="Compress model", disabled=False) - - to_compress - -.. code:: ipython3 - - from cmd_helper import optimum_cli - - model_dir = MODEL_DIR / "INT4" if to_compress.value else MODEL_DIR / "FP16" - if not model_dir.exists(): - optimum_cli(model_id, model_dir, additional_args={"weight-format": "int4" if to_compress.value else "fp16", "trust-remote-code": ""}) - - - -**Export command:** - - - -``optimum-cli export openvino --model microsoft/Phi-3.5-vision-instruct Phi-3.5-vision-instruct/INT4 --weight-format int4 --trust-remote-code`` - - -.. parsed-literal:: - - 2024-12-24 08:39:28.193255: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-12-24 08:39:28.205380: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - WARNING: All log messages before absl::InitializeLog() is called are written to STDERR - E0000 00:00:1735015168.220063 230613 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - E0000 00:00:1735015168.224457 230613 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2024-12-24 08:39:28.238718: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.14s/it] - The class `optimum.bettertransformers.transformation.BetterTransformer` is deprecated and will be removed in a future release. - WARNING:root:Cannot apply model.to_bettertransformer because of the exception: - The model type phi3_v is not yet supported to be used with BetterTransformer. Feel free to open an issue at https://github.com/huggingface/optimum/issues if you would like this model type to be supported. Currently supported models are: dict_keys(['albert', 'bark', 'bart', 'bert', 'bert-generation', 'blenderbot', 'bloom', 'camembert', 'blip-2', 'clip', 'codegen', 'data2vec-text', 'deit', 'distilbert', 'electra', 'ernie', 'fsmt', 'gpt2', 'gptj', 'gpt_neo', 'gpt_neox', 'hubert', 'layoutlm', 'm2m_100', 'marian', 'markuplm', 'mbart', 'opt', 'pegasus', 'rembert', 'prophetnet', 'roberta', 'roc_bert', 'roformer', 'splinter', 'tapas', 't5', 'vilt', 'vit', 'vit_mae', 'vit_msn', 'wav2vec2', 'xlm-roberta', 'yolos']).. Usage model with stateful=True may be non-effective if model does not contain torch.functional.scaled_dot_product_attention - `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. - /home/ea/work/py311/lib/python3.11/site-packages/transformers/cache_utils.py:458: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. 
- or len(self.key_cache[layer_idx]) == 0 # the layer has no cache - /home/ea/work/py311/lib/python3.11/site-packages/transformers/modeling_attn_mask_utils.py:116: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal: - /home/ea/work/py311/lib/python3.11/site-packages/optimum/exporters/onnx/model_patcher.py:306: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if past_key_values_length > 0: - /home/ea/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3.5-vision-instruct/4a0d683eba9f1d0cbfb6151705d1ee73c25a80ca/modeling_phi3_v.py:444: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - seq_len = seq_len or torch.max(position_ids) + 1 - /home/ea/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3.5-vision-instruct/4a0d683eba9f1d0cbfb6151705d1ee73c25a80ca/modeling_phi3_v.py:445: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if seq_len > self.original_max_position_embeddings: - /home/ea/work/py311/lib/python3.11/site-packages/nncf/torch/dynamic_graph/wrappers.py:85: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. - op1 = operator(\*args, \*\*kwargs) - /home/ea/work/py311/lib/python3.11/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. - elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors - /home/ea/work/py311/lib/python3.11/site-packages/transformers/models/clip/modeling_clip.py:243: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): - - -.. 
parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ int8_asym │ 3% (1 / 129) │ 0% (0 / 128) │ - ├───────────────────────────┼─────────────────────────────┼────────────────────────────────────────┤ - │ int4_asym │ 97% (128 / 129) │ 100% (128 / 128) │ - ┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:01:58 • 0:00:00;0;104;181m0:00:01181m0:00:05 - [?25hINFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ int8_sym │ 100% (139 / 139) │ 100% (139 / 139) │ - ┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:00:01 • 0:00:0001 • 0:00:01 - [?25hINFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ int8_sym │ 100% (1 / 1) │ 100% (1 / 1) │ - ┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:00:00 • 0:00:00 - [?25hINFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ int8_sym │ 100% (2 / 2) │ 100% (2 / 2) │ - ┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:00:00 • 0:00:00 - [?25h - -Select inference device ------------------------ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(default="AUTO", exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Run OpenVINO model ------------------- - - - -OpenVINO integration with Optimum Intel provides ready-to-use API for -model inference that can be used for smooth integration with -transformers-based solutions. For loading model, we will use -``OVModelForVisualCausalLM`` class that have compatible interface with -Transformers LLaVA implementation. For loading a model, -``from_pretrained`` method should be used. It accepts path to the model -directory or model_id from HuggingFace hub (if model is not converted to -OpenVINO format, conversion will be triggered automatically). 
-Additionally, we can provide an inference device, quantization config -(if model has not been quantized yet) and device-specific OpenVINO -Runtime configuration. More details about model inference with Optimum -Intel can be found in -`documentation `__. - -.. code:: ipython3 - - from optimum.intel.openvino import OVModelForVisualCausalLM - - model = OVModelForVisualCausalLM.from_pretrained(model_dir, device=device.value, trust_remote_code=True) - -.. code:: ipython3 - - import requests - from PIL import Image - - image_path = Path("cat.png") - - if not image_path.exists(): - url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11" - image = Image.open(requests.get(url, stream=True).raw) - image.save(image_path) - else: - image = Image.open(image_path) - - print("Question:\n What is unusual on this picture?") - image - - -.. parsed-literal:: - - Question: - What is unusual on this picture? - - - - -.. image:: phi-3-vision-with-output_files/phi-3-vision-with-output_14_1.png - - - -.. code:: ipython3 - - from transformers import AutoProcessor, TextStreamer - - messages = [ - {"role": "user", "content": "<|image_1|>\nWhat is unusual on this picture?"}, - ] - - processor = AutoProcessor.from_pretrained(MODEL_DIR / "INT4" if to_compress.value else "FP16", trust_remote_code=True) - - prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - - inputs = processor(prompt, [image], return_tensors="pt") - - generation_args = {"max_new_tokens": 50, "do_sample": False, "streamer": TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)} - - print("Answer:") - generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args) - - -.. parsed-literal:: - - Answer: - A cat is lying in a box. - - -Interactive demo ----------------- - - - -.. 
code:: ipython3 - - from gradio_helper import make_demo - - demo = make_demo(model, processor) - - try: - demo.launch(debug=True, height=600) - except Exception: - demo.launch(debug=True, share=True, height=600) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/phi-3-vision-with-output_files/phi-3-vision-with-output_14_1.jpg b/docs/notebooks/phi-3-vision-with-output_files/phi-3-vision-with-output_14_1.jpg deleted file mode 100644 index c6aeec77cd3cb2..00000000000000 --- a/docs/notebooks/phi-3-vision-with-output_files/phi-3-vision-with-output_14_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5fc0d22d75f23474fb4f8aec8c0bf0fdf5d9377f3379e82a3887003e6da47e7e -size 60425 diff --git a/docs/notebooks/phi-3-vision-with-output_files/phi-3-vision-with-output_14_1.png b/docs/notebooks/phi-3-vision-with-output_files/phi-3-vision-with-output_14_1.png deleted file mode 100644 index c6673a757ab5dc..00000000000000 --- a/docs/notebooks/phi-3-vision-with-output_files/phi-3-vision-with-output_14_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c715d8adee4bf7519690de20b57ef2edaa2f914c86a64d107f99a919dcdad218 -size 854224 diff --git a/docs/notebooks/photo-maker-with-output.rst b/docs/notebooks/photo-maker-with-output.rst deleted file mode 100644 index 19be236acb8fdd..00000000000000 --- a/docs/notebooks/photo-maker-with-output.rst +++ /dev/null @@ -1,841 +0,0 @@ -Text-to-image generation using PhotoMaker and OpenVINO -====================================================== - -PhotoMaker is an efficient personalized text-to-image generation method, -which mainly encodes an arbitrary number of input ID images into a stack -ID embedding for preserving ID information. Such an embedding, serving -as a unified ID representation, can not only encapsulate the -characteristics of the same input ID comprehensively, but also -accommodate the characteristics of different IDs for subsequent -integration. This paves the way for more intriguing and practically -valuable applications. Users can input one or a few face photos, along -with a text prompt, to receive a customized photo or painting (no -training required!). Additionally, this model can be adapted to any base -model based on ``SDXL`` or used in conjunction with other ``LoRA`` -modules.More details about PhotoMaker can be found in the `technical -report `__. - -This notebook explores how to speed up PhotoMaker pipeline using -OpenVINO. 
- - -**Table of contents:** - - -- `PhotoMaker pipeline - introduction <#photomaker-pipeline-introduction>`__ -- `Prerequisites <#prerequisites>`__ -- `Load original pipeline and prepare models for - conversion <#load-original-pipeline-and-prepare-models-for-conversion>`__ -- `Convert models to OpenVINO Intermediate representation (IR) - format <#convert-models-to-openvino-intermediate-representation-ir-format>`__ - - - `ID Encoder <#id-encoder>`__ - - `Text Encoder <#text-encoder>`__ - - `U-Net <#u-net>`__ - - `VAE Decoder <#vae-decoder>`__ - -- `Prepare Inference pipeline <#prepare-inference-pipeline>`__ - - - `Select inference device for Stable Diffusion - pipeline <#select-inference-device-for-stable-diffusion-pipeline>`__ - - `Compile models and create their Wrappers for - inference <#compile-models-and-create-their-wrappers-for-inference>`__ - -- `Running Text-to-Image Generation with - OpenVINO <#running-text-to-image-generation-with-openvino>`__ -- `Interactive Demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -PhotoMaker pipeline introduction --------------------------------- - - - -For the proposed PhotoMaker, we first obtain the text embedding and -image embeddings from ``text encoder(s)`` and ``image(ID) encoder``, -respectively. Then, we extract the fused embedding by merging the -corresponding class embedding (e.g., man and woman) and each image -embedding. Next, we concatenate all fused embeddings along the length -dimension to form the stacked ID embedding. Finally, we feed the stacked -ID embedding to all cross-attention layers for adaptively merging the ID -content in the ``diffusion model``. Note that although we use images of -the same ID with the masked background during training, we can directly -input images of different IDs without background distortion to create a -new ID during inference. - -Prerequisites -------------- - - - -Clone PhotoMaker repository - -.. code:: ipython3 - - from pathlib import Path - import requests - - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - -.. code:: ipython3 - - from cmd_helper import clone_repo - - - clone_repo("https://github.com/TencentARC/PhotoMaker.git", "1e78aa6514c11a84ef1be27b56c7c72d6c70f8fc", add_to_sys_path=False) - -Install required packages - -.. code:: ipython3 - - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu \ - transformers "torch>=2.1" "diffusers>=0.26,<0.30" "gradio>=4.19" "openvino>=2024.0.0" "einops" torchvision "peft>=0.6.2" "nncf>=2.9.0" "protobuf==3.20.3" "insightface" "onnxruntime" - -Prepare PyTorch models - -.. 
code:: ipython3 - - adapter_id = "TencentARC/PhotoMaker" - base_model_id = "SG161222/RealVisXL_V3.0" - - TEXT_ENCODER_OV_PATH = Path("model/text_encoder.xml") - TEXT_ENCODER_2_OV_PATH = Path("model/text_encoder_2.xml") - UNET_OV_PATH = Path("model/unet.xml") - ID_ENCODER_OV_PATH = Path("model/id_encoder.xml") - VAE_DECODER_OV_PATH = Path("model/vae_decoder.xml") - -Load original pipeline and prepare models for conversion --------------------------------------------------------- - - - -For exporting each PyTorch model, we will download the ``ID encoder`` -weight, ``LoRa`` weight from HuggingFace hub, then using the -``PhotoMakerStableDiffusionXLPipeline`` object from repository of -PhotoMaker to generate the original PhotoMaker pipeline. - -.. code:: ipython3 - - import torch - import numpy as np - import os - from PIL import Image - from pathlib import Path - from PhotoMaker.photomaker.model import PhotoMakerIDEncoder - from PhotoMaker.photomaker.pipeline import PhotoMakerStableDiffusionXLPipeline - from diffusers import EulerDiscreteScheduler - import gc - - trigger_word = "img" - - - def load_original_pytorch_pipeline_components(photomaker_path: str, base_model_id: str): - # Load base model - pipe = PhotoMakerStableDiffusionXLPipeline.from_pretrained(base_model_id, use_safetensors=True).to("cpu") - - # Load PhotoMaker checkpoint - pipe.load_photomaker_adapter( - os.path.dirname(photomaker_path), - subfolder="", - weight_name=os.path.basename(photomaker_path), - trigger_word=trigger_word, - ) - pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.fuse_lora() - gc.collect() - return pipe - -.. code:: ipython3 - - from huggingface_hub import hf_hub_download - - photomaker_path = hf_hub_download(repo_id=adapter_id, filename="photomaker-v1.bin", repo_type="model") - - pipe = load_original_pytorch_pipeline_components(photomaker_path, base_model_id) - - - -.. parsed-literal:: - - Loading pipeline components...: 0%| | 0/7 [00:00`__. Weight -compression aims to reduce the memory footprint of models, which require -extensive memory to store the weights during inference, can benefit from -weight compression in the following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method. The main -difference between weights compression and full model quantization -(post-training quantization) is that activations remain floating-point -in the case of weights compression which leads to a better accuracy. - -``nncf.compress_weights`` function can be used for performing weights -compression. The function accepts an OpenVINO model and other -compression parameters. - -More details about weights compression can be found in `OpenVINO -documentation `__. - -.. code:: ipython3 - - import openvino as ov - import nncf - - - def flattenize_inputs(inputs): - """ - Helper function for resolve nested input structure (e.g. 
lists or tuples of tensors) - """ - flatten_inputs = [] - for input_data in inputs: - if input_data is None: - continue - if isinstance(input_data, (list, tuple)): - flatten_inputs.extend(flattenize_inputs(input_data)) - else: - flatten_inputs.append(input_data) - return flatten_inputs - - - dtype_mapping = { - torch.float32: ov.Type.f32, - torch.float64: ov.Type.f64, - torch.int32: ov.Type.i32, - torch.int64: ov.Type.i64, - torch.bool: ov.Type.boolean, - } - - - def prepare_input_info(input_dict): - """ - Helper function for preparing input info (shapes and data types) for conversion based on example inputs - """ - flatten_inputs = flattenize_inputs(input_dict.values()) - input_info = [] - for input_data in flatten_inputs: - updated_shape = list(input_data.shape) - if input_data.ndim == 5: - updated_shape[1] = -1 - input_info.append((dtype_mapping[input_data.dtype], updated_shape)) - return input_info - - - def convert(model: torch.nn.Module, xml_path: str, example_input, input_info): - """ - Helper function for converting PyTorch model to OpenVINO IR - """ - xml_path = Path(xml_path) - if not xml_path.exists(): - xml_path.parent.mkdir(parents=True, exist_ok=True) - with torch.no_grad(): - ov_model = ov.convert_model(model, example_input=example_input, input=input_info) - ov_model = nncf.compress_weights(ov_model) - ov.save_model(ov_model, xml_path) - - del ov_model - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino - - -ID Encoder -~~~~~~~~~~ - - - -PhotoMaker merged image encoder and fuse module to create an ID Encoder. -It will used to generate image embeddings to update text encoder’s -output(text embeddings) which will be the input for U-Net model. - -.. code:: ipython3 - - id_encoder = pipe.id_encoder - id_encoder.eval() - - - def create_bool_tensor(*size): - new_tensor = torch.zeros((size), dtype=torch.bool) - return new_tensor - - - inputs = { - "id_pixel_values": torch.randn((1, 1, 3, 224, 224)), - "prompt_embeds": torch.randn((1, 77, 2048)), - "class_tokens_mask": create_bool_tensor(1, 77), - } - - input_info = prepare_input_info(inputs) - - convert(id_encoder, ID_ENCODER_OV_PATH, inputs, input_info) - - del id_encoder - gc.collect(); - -Text Encoder -~~~~~~~~~~~~ - - - -The text-encoder is responsible for transforming the input prompt, for -example, “a photo of an astronaut riding a horse” into an embedding -space that can be understood by the U-Net. It is usually a simple -transformer-based encoder that maps a sequence of input tokens to a -sequence of latent text embeddings. - -.. 
code:: ipython3 - - import types - - text_encoder = pipe.text_encoder - text_encoder.eval() - text_encoder_2 = pipe.text_encoder_2 - text_encoder_2.eval() - - text_encoder.config.output_hidden_states = True - text_encoder.config.return_dict = False - - inputs = {"input_ids": torch.ones((1, 77), dtype=torch.long)} - - input_info = prepare_input_info(inputs) - - convert(text_encoder, TEXT_ENCODER_OV_PATH, inputs, input_info) - - text_encoder_2._orig_forward = text_encoder_2.forward - - - def text_encoder_fwd_wrapper(self, input_ids): - res = self._orig_forward(input_ids, return_dict=True, output_hidden_states=True) - return tuple([v for v in res.values() if v is not None]) - - - text_encoder_2.forward = types.MethodType(text_encoder_fwd_wrapper, text_encoder_2) - - convert(text_encoder_2, TEXT_ENCODER_2_OV_PATH, inputs, input_info) - - del text_encoder - del text_encoder_2 - gc.collect(); - - -.. parsed-literal:: - - WARNING:nncf:NNCF provides best results with torch==2.3.*, while current torch version is 2.4.0+cpu. If you encounter issues, consider switching to torch==2.3.* - - -.. parsed-literal:: - - /home/ea/work/openvino_notebooks_new_clone/nb_env/lib/python3.8/site-packages/transformers/modeling_utils.py:4664: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead - warnings.warn( - /home/ea/work/openvino_notebooks_new_clone/nb_env/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:86: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if input_shape[-1] > 1 or self.sliding_window is not None: - /home/ea/work/openvino_notebooks_new_clone/nb_env/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:162: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if past_key_values_length > 0: - - -.. parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (194 / 194) │ 100% (194 / 194) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - 18214 - - - -U-Net -~~~~~ - - - -The process of U-Net model conversion remains the same, like for -original Stable Diffusion XL model. - -.. 
code:: ipython3 - - unet = pipe.unet - unet.eval() - - - class UnetWrapper(torch.nn.Module): - def __init__(self, unet): - super().__init__() - self.unet = unet - - def forward( - self, - sample=None, - timestep=None, - encoder_hidden_states=None, - text_embeds=None, - time_ids=None, - ): - return self.unet.forward( - sample, - timestep, - encoder_hidden_states, - added_cond_kwargs={"text_embeds": text_embeds, "time_ids": time_ids}, - ) - - - inputs = { - "sample": torch.rand([2, 4, 128, 128], dtype=torch.float32), - "timestep": torch.from_numpy(np.array(1, dtype=float)), - "encoder_hidden_states": torch.rand([2, 77, 2048], dtype=torch.float32), - "text_embeds": torch.rand([2, 1280], dtype=torch.float32), - "time_ids": torch.rand([2, 6], dtype=torch.float32), - } - - input_info = prepare_input_info(inputs) - - w_unet = UnetWrapper(unet) - convert(w_unet, UNET_OV_PATH, inputs, input_info) - - del w_unet, unet - gc.collect(); - -VAE Decoder -~~~~~~~~~~~ - - - -The VAE model has two parts, an encoder and a decoder. The encoder is -used to convert the image into a low dimensional latent representation, -which will serve as the input to the U-Net model. The decoder, -conversely, transforms the latent representation back into an image. - -When running Text-to-Image pipeline, we will see that we only need the -VAE decoder. - -.. code:: ipython3 - - vae_decoder = pipe.vae - vae_decoder.eval() - - - class VAEDecoderWrapper(torch.nn.Module): - def __init__(self, vae_decoder): - super().__init__() - self.vae = vae_decoder - - def forward(self, latents): - return self.vae.decode(latents) - - - w_vae_decoder = VAEDecoderWrapper(vae_decoder) - inputs = torch.zeros((1, 4, 128, 128)) - - convert(w_vae_decoder, VAE_DECODER_OV_PATH, inputs, input_info=[1, 4, 128, 128]) - - del w_vae_decoder, vae_decoder - gc.collect(); - -Prepare Inference pipeline --------------------------- - - - -In this example, we will reuse ``PhotoMakerStableDiffusionXLPipeline`` -pipeline to generate the image with OpenVINO, so each model’s object in -this pipeline should be replaced with new OpenVINO model object. - -Select inference device for Stable Diffusion pipeline -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import device_widget - - device = device_widget() - - device - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("photo-maker.ipynb") - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='CPU') - - - -Compile models and create their Wrappers for inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To access original PhotoMaker workflow, we have to create a new wrapper -for each OpenVINO compiled model. For matching original pipeline, part -of OpenVINO model wrapper’s attributes should be reused from original -model objects and inference output must be converted from numpy to -``torch.tensor``. - - - -.. 
code:: ipython3 - - import openvino as ov - - core = ov.Core() - - compiled_id_encoder = core.compile_model(ID_ENCODER_OV_PATH, device.value) - compiled_unet = core.compile_model(UNET_OV_PATH, device.value) - compiled_text_encoder = core.compile_model(TEXT_ENCODER_OV_PATH, device.value) - compiled_text_encoder_2 = core.compile_model(TEXT_ENCODER_2_OV_PATH, device.value) - compiled_vae_decoder = core.compile_model(VAE_DECODER_OV_PATH, device.value) - -.. code:: ipython3 - - from collections import namedtuple - - - class OVIDEncoderWrapper(PhotoMakerIDEncoder): - dtype = torch.float32 # accessed in the original workflow - - def __init__(self, id_encoder, orig_id_encoder): - super().__init__() - self.id_encoder = id_encoder - self.modules = orig_id_encoder.modules # accessed in the original workflow - self.config = orig_id_encoder.config # accessed in the original workflow - - def __call__( - self, - *args, - ): - id_pixel_values, prompt_embeds, class_tokens_mask = args - inputs = { - "id_pixel_values": id_pixel_values, - "prompt_embeds": prompt_embeds, - "class_tokens_mask": class_tokens_mask, - } - output = self.id_encoder(inputs)[0] - return torch.from_numpy(output) - -.. code:: ipython3 - - class OVTextEncoderWrapper: - dtype = torch.float32 # accessed in the original workflow - - def __init__(self, text_encoder, orig_text_encoder): - self.text_encoder = text_encoder - self.modules = orig_text_encoder.modules # accessed in the original workflow - self.config = orig_text_encoder.config # accessed in the original workflow - - def __call__(self, input_ids, **kwargs): - inputs = {"input_ids": input_ids} - output = self.text_encoder(inputs) - - hidden_states = [] - hidden_states_len = len(output) - for i in range(1, hidden_states_len): - hidden_states.append(torch.from_numpy(output[i])) - if hidden_states_len - 1 < 2: - hidden_states.append(torch.from_numpy(output[i])) - BaseModelOutputWithPooling = namedtuple("BaseModelOutputWithPooling", "last_hidden_state hidden_states") - output = BaseModelOutputWithPooling(torch.from_numpy(output[0]), hidden_states) - return output - -.. code:: ipython3 - - class OVUnetWrapper: - def __init__(self, unet, unet_orig): - self.unet = unet - self.config = unet_orig.config # accessed in the original workflow - self.add_embedding = unet_orig.add_embedding # accessed in the original workflow - - def __call__(self, *args, **kwargs): - latent_model_input, t = args - inputs = { - "sample": latent_model_input, - "timestep": t, - "encoder_hidden_states": kwargs["encoder_hidden_states"], - "text_embeds": kwargs["added_cond_kwargs"]["text_embeds"], - "time_ids": kwargs["added_cond_kwargs"]["time_ids"], - } - - output = self.unet(inputs) - - return [torch.from_numpy(output[0])] - -.. code:: ipython3 - - class OVVAEDecoderWrapper: - dtype = torch.float32 # accessed in the original workflow - - def __init__(self, vae, vae_orig): - self.vae = vae - self.config = vae_orig.config # accessed in the original workflow - - def decode(self, latents, return_dict=False): - output = self.vae(latents)[0] - output = torch.from_numpy(output) - - return [output] - -Replace the PyTorch model objects in original pipeline with OpenVINO -models - -.. 
code:: ipython3 - - pipe.id_encoder = OVIDEncoderWrapper(compiled_id_encoder, pipe.id_encoder) - pipe.unet = OVUnetWrapper(compiled_unet, pipe.unet) - pipe.text_encoder = OVTextEncoderWrapper(compiled_text_encoder, pipe.text_encoder) - pipe.text_encoder_2 = OVTextEncoderWrapper(compiled_text_encoder_2, pipe.text_encoder_2) - pipe.vae = OVVAEDecoderWrapper(compiled_vae_decoder, pipe.vae) - -Running Text-to-Image Generation with OpenVINO ----------------------------------------------- - - - -.. code:: ipython3 - - from diffusers.utils import load_image - - prompt = "sci-fi, closeup portrait photo of a man img in Iron man suit, face" - negative_prompt = "(asymmetry, worst quality, low quality, illustration, 3d, 2d, painting, cartoons, sketch), open mouth" - generator = torch.Generator("cpu").manual_seed(42) - - input_id_images = [] - original_image = load_image("./PhotoMaker/examples/newton_man/newton_0.jpg") - input_id_images.append(original_image) - - ## Parameter setting - num_steps = 20 - style_strength_ratio = 20 - start_merge_step = int(float(style_strength_ratio) / 100 * num_steps) - if start_merge_step > 30: - start_merge_step = 30 - - images = pipe( - prompt=prompt, - input_id_images=input_id_images, - negative_prompt=negative_prompt, - num_images_per_prompt=1, - num_inference_steps=num_steps, - start_merge_step=start_merge_step, - generator=generator, - ).images - - - -.. parsed-literal:: - - 0%| | 0/20 [00:00 30: - start_merge_step = 30 - result = pipe( - text_promt, - input_id_images=input_image, - negative_prompt=neg_prompt, - num_inference_steps=num_steps, - num_images_per_prompt=1, - start_merge_step=start_merge_step, - generator=torch.Generator().manual_seed(seed), - height=1024, - width=1024, - ).images[0] - - return result - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/photo-maker/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=generate_from_text) - - try: - demo.queue().launch(debug=True) - except Exception: - demo.queue().launch(debug=True, share=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ - -.. 
code:: ipython3 - - # please uncomment and run this cell for stopping gradio interface - # demo.close() diff --git a/docs/notebooks/photo-maker-with-output_files/photo-maker-with-output_34_0.png b/docs/notebooks/photo-maker-with-output_files/photo-maker-with-output_34_0.png deleted file mode 100644 index 49b5d8b1ab408a..00000000000000 --- a/docs/notebooks/photo-maker-with-output_files/photo-maker-with-output_34_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1ac050b7dd1bc3f2dac9c0bd31f0fa559355b892822eb378c9f6f805a657ee96 -size 357849 diff --git a/docs/notebooks/pix2struct-docvqa-with-output.rst b/docs/notebooks/pix2struct-docvqa-with-output.rst deleted file mode 100644 index 84743878b17fa4..00000000000000 --- a/docs/notebooks/pix2struct-docvqa-with-output.rst +++ /dev/null @@ -1,321 +0,0 @@ -Document Visual Question Answering Using Pix2Struct and OpenVINO™ -================================================================= - -DocVQA (Document Visual Question Answering) is a research field in -computer vision and natural language processing that focuses on -developing algorithms to answer questions related to the content of a -document represented in image format, like a scanned document, -screenshots, or an image of a text document. Unlike other types of -visual question answering, where the focus is on answering questions -related to images or videos, DocVQA is focused on understanding and -answering questions based on the text and layout of a document. The -questions can be about any aspect of the document text. DocVQA requires -understanding the document’s visual content and the ability to read and -comprehend the text in it. - -DocVQA offers several benefits compared to OCR (Optical Character -Recognition) technology: \* Firstly, DocVQA can not only recognize and -extract text from a document, but it can also understand the context in -which the text appears. This means it can answer questions about the -document’s content rather than simply provide a digital version. \* -Secondly, DocVQA can handle documents with complex layouts and -structures, like tables and diagrams, which can be challenging for -traditional OCR systems. \* Finally, DocVQA can automate many -document-based workflows, like document routing and approval processes, -to make employees focus on more meaningful work. The potential -applications of DocVQA include automating tasks like information -retrieval, document analysis, and document summarization. - -`Pix2Struct `__ is a multimodal -model for understanding visually situated language that easily copes -with extracting information from images. The model is trained using the -novel learning technique to parse masked screenshots of web pages into -simplified HTML, providing a significantly well-suited pretraining data -source for the range of downstream activities such as OCR, visual -question answering, and image captioning. - -In this tutorial, we consider how to run the Pix2Struct model using -OpenVINO for solving document visual question answering task. We will -use a pre-trained model from the `Hugging Face -Transformers `__ -library. To simplify the user experience, the `Hugging Face -Optimum `__ library is used to -convert the model to OpenVINO™ IR format. 
- - -**Table of contents:** - - -- `About Pix2Struct <#about-pix2struct>`__ -- `Prerequisites <#prerequisites>`__ -- `Download and Convert Model <#download-and-convert-model>`__ -- `Select inference device <#select-inference-device>`__ -- `Test model inference <#test-model-inference>`__ -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -About Pix2Struct ----------------- - - - -Pix2Struct is an image encoder - text decoder model that is trained on -image-text pairs for various tasks, including image captioning and -visual question answering. The model combines the simplicity of purely -pixel-level inputs with the generality and scalability provided by -self-supervised pretraining from diverse and abundant web data. The -model does this by recommending a screenshot parsing objective that -needs predicting an HTML-based parse from a screenshot of a web page -that has been partially masked. With the diversity and complexity of -textual and visual elements found on the web, Pix2Struct learns rich -representations of the underlying structure of web pages, which can -effectively transfer to various downstream visual language understanding -tasks. - -Pix2Struct is based on the Vision Transformer (ViT), an -image-encoder-text-decoder model with changes in input representation to -make the model more robust to processing images with various aspect -ratios. Standard ViT extracts fixed-size patches after scaling input -images to a predetermined resolution. This distorts the proper aspect -ratio of the image, which can be highly variable for documents, mobile -UIs, and figures. Pix2Struct proposes to scale the input image up or -down to extract the maximum number of patches that fit within the given -sequence length. This approach is more robust to extreme aspect ratios, -common in the domains Pix2Struct experiments with. Additionally, the -model can handle on-the-fly changes to the sequence length and -resolution. To handle variable resolutions unambiguously, 2-dimensional -absolute positional embeddings are used for the input patches. - -Prerequisites -------------- - - - -First, we need to install the `Hugging Face -Optimum `__ library -accelerated by OpenVINO integration. The Hugging Face Optimum API is a -high-level API that enables us to convert and quantize models from the -Hugging Face Transformers library to the OpenVINO™ IR format. For more -details, refer to the `Hugging Face Optimum -documentation `__. - -.. code:: ipython3 - - %pip install -q "torch>=2.1" torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" "openvino>=2023.1.0" "onnx<1.16.2" "gradio>=4.19" --extra-index-url https://download.pytorch.org/whl/cpu - -Download and Convert Model --------------------------- - - - -Optimum Intel can be used to load optimized models from the `Hugging -Face Hub `__ and -create pipelines to run an inference with OpenVINO Runtime using Hugging -Face APIs. The Optimum Inference models are API compatible with Hugging -Face Transformers models. This means we just need to replace the -``AutoModelForXxx`` class with the corresponding ``OVModelForXxx`` -class. 
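As a minimal sketch of this drop-in replacement (shown here only for
illustration, using the same ``google/pix2struct-docvqa-base`` checkpoint
that the tutorial exports below), the change amounts to importing the
OpenVINO model class instead of the Transformers one:

.. code:: ipython3

    # Plain Transformers (PyTorch) loading would look like:
    # from transformers import Pix2StructForConditionalGeneration
    # model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-base")

    # Optimum Intel (OpenVINO) loading -- same API, only the class changes.
    # `export=True` converts the checkpoint to OpenVINO IR on the fly.
    from optimum.intel.openvino import OVModelForPix2Struct

    ov_model = OVModelForPix2Struct.from_pretrained(
        "google/pix2struct-docvqa-base", export=True, compile=False
    )

The full export cell used in this tutorial, including saving and reloading
the converted model, follows below.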
- -Model class initialization starts with calling the ``from_pretrained`` -method. When downloading and converting the Transformers model, the -parameter ``export=True`` should be added. We can save the converted -model for the next usage with the ``save_pretrained`` method. After -model saving using the ``save_pretrained`` method, you can load your -converted model without the ``export`` parameter, avoiding model -conversion for the next time. For reducing memory consumption, we can -compress model to float16 using ``half()`` method. - -In this tutorial, we separate model export and loading for a -demonstration of how to work with the model in both modes. We will use -the -`pix2struct-docvqa-base `__ -model as an example in this tutorial, but the same steps for running are -applicable for other models from pix2struct family. - -.. code:: ipython3 - - import gc - from pathlib import Path - from optimum.intel.openvino import OVModelForPix2Struct - - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("pix2struct-docvqa.ipynb") - - model_id = "google/pix2struct-docvqa-base" - model_dir = Path(model_id.split("/")[-1]) - - if not model_dir.exists(): - ov_model = OVModelForPix2Struct.from_pretrained(model_id, export=True, compile=False) - ov_model.half() - ov_model.save_pretrained(model_dir) - del ov_model - gc.collect(); - -Select inference device ------------------------ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Test model inference --------------------- - - - -The diagram below demonstrates how the model works: -|pix2struct_diagram.png| - -For running model inference we should preprocess data first. -``Pix2StructProcessor`` is responsible for preparing input data and -decoding output for the original PyTorch model and easily can be reused -for running with the Optimum Intel model. Then -``OVModelForPix2Struct.generate`` method will launch answer generation. -Finally, generated answer token indices should be decoded in text format -by ``Pix2StructProcessor.decode`` - -.. |pix2struct_diagram.png| image:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/c7456b17-0687-4aa9-851b-267bff3dac79 - -.. code:: ipython3 - - from transformers import Pix2StructProcessor - - processor = Pix2StructProcessor.from_pretrained(model_id) - ov_model = OVModelForPix2Struct.from_pretrained(model_dir, device=device.value) - - -.. parsed-literal:: - - Compiling the encoder to AUTO ... - Compiling the decoder to AUTO ... - Compiling the decoder to AUTO ... - - -Let’s see the model in action. For testing the model, we will use a -screenshot from `OpenVINO -documentation `__ - -.. 
code:: ipython3 - - import requests - from PIL import Image - from io import BytesIO - - - def load_image(image_file): - if isinstance(image_file, str) and image_file.startswith("https://"): - response = requests.get(image_file) - image = Image.open(BytesIO(response.content)) - else: - image = Image.open(image_file) - return image.convert("RGB") - - - test_image_path = Path("ov_docs.png") - test_image_url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/aa46ef0c-c14d-4bab-8bb7-3b22fe73f6bc" - - if not test_image_path.exists(): - image = load_image(test_image_url) - image.save(test_image_path) - else: - image = load_image(test_image_path) - - text = "What performance hints do?" - - inputs = processor(images=image, text=text, return_tensors="pt") - display(image) - - - -.. image:: pix2struct-docvqa-with-output_files/pix2struct-docvqa-with-output_11_0.png - - -.. code:: ipython3 - - answer_tokens = ov_model.generate(**inputs) - answer = processor.decode(answer_tokens[0], skip_special_tokens=True) - print(f"Question: {text}") - print(f"Answer: {answer}") - - -.. parsed-literal:: - - /home/ea/work/ov_venv/lib/python3.8/site-packages/optimum/intel/openvino/modeling_seq2seq.py:395: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly. - last_hidden_state = torch.from_numpy(self.request(inputs, shared_memory=True)["last_hidden_state"]).to( - /home/ea/work/ov_venv/lib/python3.8/site-packages/transformers/generation/utils.py:1260: UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation. - warnings.warn( - /home/ea/work/ov_venv/lib/python3.8/site-packages/optimum/intel/openvino/modeling_seq2seq.py:476: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly. - self.request.start_async(inputs, shared_memory=True) - - -.. parsed-literal:: - - Question: What performance hints do? - Answer: automatically adjust runtime parameters to prioritize for low latency or high throughput - - -Interactive demo ----------------- - - - -.. 
code:: ipython3 - - def generate(img, question): - inputs = processor(images=img, text=question, return_tensors="pt") - predictions = ov_model.generate(**inputs, max_new_tokens=256) - return processor.decode(predictions[0], skip_special_tokens=True) - - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/pix2struct-docvqa/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=generate) - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/pix2struct-docvqa-with-output_files/pix2struct-docvqa-with-output_11_0.jpg b/docs/notebooks/pix2struct-docvqa-with-output_files/pix2struct-docvqa-with-output_11_0.jpg deleted file mode 100644 index 6e51fa5a49d9ac..00000000000000 --- a/docs/notebooks/pix2struct-docvqa-with-output_files/pix2struct-docvqa-with-output_11_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c25269a3a1e49d96f9c95144211ee2614ca41782cc870f86fd758c4b9fb5d6a5 -size 134092 diff --git a/docs/notebooks/pix2struct-docvqa-with-output_files/pix2struct-docvqa-with-output_11_0.png b/docs/notebooks/pix2struct-docvqa-with-output_files/pix2struct-docvqa-with-output_11_0.png deleted file mode 100644 index 2ed53c36850f65..00000000000000 --- a/docs/notebooks/pix2struct-docvqa-with-output_files/pix2struct-docvqa-with-output_11_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a9fa291a981591490c2e4b97a709e750c1eb7fc734dd8645888bdb97ccb935a6 -size 221889 diff --git a/docs/notebooks/pixart-with-output.rst b/docs/notebooks/pixart-with-output.rst deleted file mode 100644 index 171a4730b347f9..00000000000000 --- a/docs/notebooks/pixart-with-output.rst +++ /dev/null @@ -1,680 +0,0 @@ -PixArt-α: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis with OpenVINO -========================================================================================================= - -`This paper `__ introduces -`PIXART-α `__, a -Transformer-based T2I diffusion model whose image generation quality is -competitive with state-of-the-art image generators, reaching -near-commercial application standards. Additionally, it supports -high-resolution image synthesis up to 1024px resolution with low -training cost. To achieve this goal, three core designs are proposed: 1. -Training strategy decomposition: We devise three distinct training steps -that separately optimize pixel dependency, text-image alignment, and -image aesthetic quality; 2. Efficient T2I Transformer: We incorporate -cross-attention modules into Diffusion Transformer (DiT) to inject text -conditions and streamline the computation-intensive class-condition -branch; 3. High-informative data: We emphasize the significance of -concept density in text-image pairs and leverage a large Vision-Language -model to auto-label dense pseudo-captions to assist text-image alignment -learning. - -.. 
image:: https://huggingface.co/PixArt-alpha/PixArt-XL-2-1024-MS/resolve/main/asset/images/teaser.png - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Load the original model <#load-the-original-model>`__ -- `Convert the model to OpenVINO - IR <#convert-the-model-to-openvino-ir>`__ - - - `Convert text encoder <#convert-text-encoder>`__ - - `Convert transformer <#convert-transformer>`__ - - `Convert VAE decoder <#convert-vae-decoder>`__ - -- `Compiling models <#compiling-models>`__ -- `Building the pipeline <#building-the-pipeline>`__ -- `Quantization <#quantization>`__ - - - `Prepare calibration dataset <#prepare-calibration-dataset>`__ - - `Run quantization <#run-quantization>`__ - - - `Run Hybrid Quantization <#run-hybrid-quantization>`__ - - `Run Weights Compression <#run-weights-compression>`__ - - - `Compare model file sizes <#compare-model-file-sizes>`__ - - `Compare inference time of the FP16 and optimized - pipelines <#compare-inference-time-of-the-fp16-and-optimized-pipelines>`__ - -- `Interactive inference <#interactive-inference>`__ - - - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "diffusers>=0.14.0" sentencepiece "datasets>=2.14.6" "transformers>=4.25.1" "gradio>=4.19" "torch>=2.1" Pillow opencv-python --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -Uq "openvino>=2024.3.0" - -.. code:: ipython3 - - import requests - from pathlib import Path - - - if not Path("pixart_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/pixart/pixart_helper.py") - open("pixart_helper.py", "w").write(r.text) - - if not Path("pixart_quantization_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/pixart/pixart_quantization_helper.py") - open("pixart_quantization_helper.py", "w").write(r.text) - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("pixart.ipynb") - -Load and run the original pipeline ----------------------------------- - - - -We use -`PixArt-LCM-XL-2-1024-MS `__ -that uses LCMs. `LCMs `__ is a -diffusion distillation method which predict ``PF-ODE's`` solution -directly in latent space, achieving super fast inference with few steps. - -.. code:: ipython3 - - import torch - from diffusers import PixArtAlphaPipeline - - - pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", use_safetensors=True) - - prompt = "A small cactus with a happy face in the Sahara desert." - generator = torch.Generator().manual_seed(42) - - image = pipe(prompt, guidance_scale=0.0, num_inference_steps=4, generator=generator).images[0] - -.. code:: ipython3 - - image - - - - -.. 
image:: pixart-with-output_files/pixart-with-output_6_0.png - - - -Convert the model to OpenVINO IR --------------------------------- - - - -Let’s define the conversion function for PyTorch modules. We use -``ov.convert_model`` function to obtain OpenVINO Intermediate -Representation object and ``ov.save_model`` function to save it as XML -file. - -.. code:: ipython3 - - import torch - import openvino as ov - - - def convert(model: torch.nn.Module, xml_path: str, example_input): - xml_path = Path(xml_path) - if not xml_path.exists(): - xml_path.parent.mkdir(parents=True, exist_ok=True) - model.eval() - with torch.no_grad(): - converted_model = ov.convert_model(model, example_input=example_input) - ov.save_model(converted_model, xml_path) - - # cleanup memory - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - -PixArt-α consists of pure transformer blocks for latent diffusion: It -can directly generate 1024px images from text prompts within a single -sampling process. - -|image1|. - -During inference it uses text encoder ``T5EncoderModel``, transformer -``Transformer2DModel`` and VAE decoder ``AutoencoderKL``. Let’s convert -the models from the pipeline one by one. - -.. |image1| image:: https://huggingface.co/PixArt-alpha/PixArt-XL-2-1024-MS/resolve/main/asset/images/model.png - -.. code:: ipython3 - - from pixart_helper import TEXT_ENCODER_PATH, TRANSFORMER_OV_PATH, VAE_DECODER_PATH - -Convert text encoder -~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - example_input = { - "input_ids": torch.zeros(1, 120, dtype=torch.int64), - "attention_mask": torch.zeros(1, 120, dtype=torch.int64), - } - - convert(pipe.text_encoder, TEXT_ENCODER_PATH, example_input) - -Convert transformer -~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - class TransformerWrapper(torch.nn.Module): - def __init__(self, transformer): - super().__init__() - self.transformer = transformer - - def forward(self, hidden_states=None, timestep=None, encoder_hidden_states=None, encoder_attention_mask=None, resolution=None, aspect_ratio=None): - return self.transformer.forward( - hidden_states, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - added_cond_kwargs={"resolution": resolution, "aspect_ratio": aspect_ratio}, - ) - - - example_input = { - "hidden_states": torch.rand([2, 4, 128, 128], dtype=torch.float32), - "timestep": torch.tensor([999, 999]), - "encoder_hidden_states": torch.rand([2, 120, 4096], dtype=torch.float32), - "encoder_attention_mask": torch.rand([2, 120], dtype=torch.float32), - "resolution": torch.tensor([[1024.0, 1024.0], [1024.0, 1024.0]]), - "aspect_ratio": torch.tensor([[1.0], [1.0]]), - } - - - w_transformer = TransformerWrapper(pipe.transformer) - convert(w_transformer, TRANSFORMER_OV_PATH, example_input) - -Convert VAE decoder -~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - class VAEDecoderWrapper(torch.nn.Module): - def __init__(self, vae): - super().__init__() - self.vae = vae - - def forward(self, latents): - return self.vae.decode(latents, return_dict=False) - - - convert(VAEDecoderWrapper(pipe.vae), VAE_DECODER_PATH, (torch.zeros((1, 4, 128, 128)))) - -Compiling models ----------------- - - - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. 
parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - core = ov.Core() - - compiled_model = core.compile_model(TRANSFORMER_OV_PATH, device.value) - compiled_vae = core.compile_model(VAE_DECODER_PATH, device.value) - compiled_text_encoder = core.compile_model(TEXT_ENCODER_PATH, device.value) - -Building the pipeline ---------------------- - - - -Let’s create callable wrapper classes for compiled models to allow -interaction with original pipelines. Note that all of wrapper classes -return ``torch.Tensor``\ s instead of ``np.array``\ s. - -.. code:: ipython3 - - from collections import namedtuple - - EncoderOutput = namedtuple("EncoderOutput", "last_hidden_state") - - - class TextEncoderWrapper(torch.nn.Module): - def __init__(self, text_encoder, dtype): - super().__init__() - self.text_encoder = text_encoder - self.dtype = dtype - - def forward(self, input_ids=None, attention_mask=None): - inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - last_hidden_state = self.text_encoder(inputs)[0] - return EncoderOutput(torch.from_numpy(last_hidden_state)) - -.. code:: ipython3 - - class TransformerWrapper(torch.nn.Module): - def __init__(self, transformer, config): - super().__init__() - self.transformer = transformer - self.config = config - - def forward( - self, - hidden_states=None, - timestep=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - resolution=None, - aspect_ratio=None, - added_cond_kwargs=None, - **kwargs - ): - inputs = { - "hidden_states": hidden_states, - "timestep": timestep, - "encoder_hidden_states": encoder_hidden_states, - "encoder_attention_mask": encoder_attention_mask, - } - resolution = added_cond_kwargs["resolution"] - aspect_ratio = added_cond_kwargs["aspect_ratio"] - if resolution is not None: - inputs["resolution"] = resolution - inputs["aspect_ratio"] = aspect_ratio - outputs = self.transformer(inputs)[0] - - return [torch.from_numpy(outputs)] - -.. code:: ipython3 - - class VAEWrapper(torch.nn.Module): - def __init__(self, vae, config): - super().__init__() - self.vae = vae - self.config = config - - def decode(self, latents=None, **kwargs): - inputs = { - "latents": latents, - } - - outs = self.vae(inputs) - outs = namedtuple("VAE", "sample")(torch.from_numpy(outs[0])) - - return outs - -And insert wrappers instances in the pipeline: - -.. code:: ipython3 - - pipe.__dict__["_internal_dict"]["_execution_device"] = pipe._execution_device # this is to avoid some problem that can occur in the pipeline - - pipe.register_modules( - text_encoder=TextEncoderWrapper(compiled_text_encoder, pipe.text_encoder.dtype), - transformer=TransformerWrapper(compiled_model, pipe.transformer.config), - vae=VAEWrapper(compiled_vae, pipe.vae.config), - ) - -.. code:: ipython3 - - generator = torch.Generator().manual_seed(42) - - image = pipe(prompt=prompt, guidance_scale=0.0, num_inference_steps=4, generator=generator).images[0] - -.. code:: ipython3 - - image - - - - -.. image:: pixart-with-output_files/pixart-with-output_27_0.png - - - -Quantization ------------- - - - -`NNCF `__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. 
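In its simplest form, post-training quantization with NNCF is a single
``nncf.quantize`` call over a calibration dataset. The sketch below only
illustrates that API shape and assumes a hypothetical ``calibration_items``
list of example transformer inputs; the hybrid quantization flow actually
used in this tutorial is shown later in this section.

.. code:: ipython3

    import nncf
    import openvino as ov

    core = ov.Core()
    # FP16 transformer IR produced by the conversion step above.
    fp_model = core.read_model(TRANSFORMER_OV_PATH)

    # `calibration_items` is a hypothetical list of example model inputs
    # (dicts of input tensors) gathered by running the pipeline on sample prompts.
    calibration_dataset = nncf.Dataset(calibration_items)

    quantized_model = nncf.quantize(model=fp_model, calibration_dataset=calibration_dataset)
    ov.save_model(quantized_model, "transformer_int8.xml")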
- -According to ``PixArt-LCM-XL-2-1024-MS`` structure, -``Transformer2DModel`` is used in the cycle repeating inference on each -diffusion step, while other parts of pipeline take part only once. -Quantizing the rest of the pipeline does not significantly improve -inference performance but can lead to a substantial degradation of -accuracy. That’s why we use only weight compression in 4-bits for the -``text encoder`` and ``vae decoder`` to reduce the memory footprint. Now -we will show you how to optimize pipeline using -`NNCF `__ to reduce memory and -computation cost. - -Please select below whether you would like to run quantization to -improve model inference speed. - - **NOTE**: Quantization is time and memory consuming operation. - Running quantization code below may take some time. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget() - - to_quantize - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - optimized_pipe = None - - %load_ext skip_kernel_extension - -Prepare calibration dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use a portion of -`google-research-datasets/conceptual_captions `__ -dataset from Hugging Face as calibration data. We use prompts below to -guide image generation and to determine what not to include in the -resulting image. - -To collect intermediate model inputs for calibration we should customize -``CompiledModel``. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from pixart_quantization_helper import INT8_TRANSFORMER_OV_PATH, INT4_TEXT_ENCODER_PATH, INT4_VAE_DECODER_PATH, collect_calibration_data - - if not INT8_TRANSFORMER_OV_PATH.exists(): - subset_size = 100 - calibration_data = collect_calibration_data(pipe, subset_size=subset_size) - -Run Hybrid Quantization -~~~~~~~~~~~~~~~~~~~~~~~ - - - -For the ``Transformer2DModel`` model we apply quantization in hybrid -mode which means that we quantize: (1) weights of MatMul and Embedding -layers and (2) activations of other layers. The steps are the following: - -1. Create a calibration dataset for quantization. -2. Collect operations with weights. -3. Run nncf.compress_model() to compress only the model weights. -4. Run nncf.quantize() on the compressed model with weighted operations - ignored by providing ignored_scope parameter. -5. Save the INT8 model using openvino.save_model() function. - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters - from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters - from pixart_quantization_helper import get_quantization_ignored_scope - - if not INT8_TRANSFORMER_OV_PATH.exists(): - model = core.read_model(TRANSFORMER_OV_PATH) - ignored_scope = get_quantization_ignored_scope(model) - # The convolution operations will be fully quantized - compressed_model = nncf.compress_weights(model, ignored_scope=nncf.IgnoredScope(types=['Convolution'])) - quantized_model = nncf.quantize( - model=compressed_model, - calibration_dataset=nncf.Dataset(calibration_data), - subset_size=subset_size, - ignored_scope=nncf.IgnoredScope(names=ignored_scope), - model_type=nncf.ModelType.TRANSFORMER, - # Disable SQ because MatMul weights are already compressed - advanced_parameters=AdvancedQuantizationParameters(smooth_quant_alphas=AdvancedSmoothQuantParameters(matmul=-1)) - ) - ov.save_model(quantized_model, INT8_TRANSFORMER_OV_PATH) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -Run Weights Compression -~~~~~~~~~~~~~~~~~~~~~~~ - - - -Quantizing of the ``T5EncoderModel`` and ``AutoencoderKL`` does not -significantly improve inference performance but can lead to a -substantial degradation of accuracy. The weight compression will be -applied to footprint reduction. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - if not INT4_TEXT_ENCODER_PATH.exists(): - text_encoder = core.read_model(TEXT_ENCODER_PATH) - compressed_text_encoder = nncf.compress_weights(text_encoder, mode=nncf.CompressWeightsMode.INT4_SYM) - ov.save_model(compressed_text_encoder, INT4_TEXT_ENCODER_PATH) - - if not INT4_VAE_DECODER_PATH.exists(): - vae_decoder = core.read_model(VAE_DECODER_PATH) - compressed_vae_decoder = nncf.compress_weights(vae_decoder, mode=nncf.CompressWeightsMode.INT4_SYM) - ov.save_model(compressed_vae_decoder, INT4_VAE_DECODER_PATH) - -Let’s compare the images generated by the original and optimized -pipelines. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - # Disable dynamic quantization due to the performance overhead for Diffusion models - optimized_transformer = core.compile_model(INT8_TRANSFORMER_OV_PATH, device.value, config={"DYNAMIC_QUANTIZATION_GROUP_SIZE":"0"}) - optimized_text_encoder = core.compile_model(INT4_TEXT_ENCODER_PATH, device.value) - optimized_vae_decoder = core.compile_model(INT4_VAE_DECODER_PATH, device.value) - - optimized_pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", use_safetensors=True) - optimized_pipe.__dict__["_internal_dict"]["_execution_device"] = optimized_pipe._execution_device # this is to avoid some problem that can occur in the pipeline - - optimized_pipe.register_modules( - text_encoder=TextEncoderWrapper(optimized_text_encoder, optimized_pipe.text_encoder.dtype), - transformer=TransformerWrapper(optimized_transformer, optimized_pipe.transformer.config), - vae=VAEWrapper(optimized_vae_decoder, optimized_pipe.vae.config), - ) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from pixart_quantization_helper import visualize_results - - prompt = "A small cactus with a happy face in the Sahara desert." 
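    # Reuse the prompt and the seed from the FP16 run above so that the
    # optimized pipeline output is directly comparable.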
- generator = torch.Generator().manual_seed(42) - opt_image = optimized_pipe(prompt=prompt, guidance_scale=0.0, num_inference_steps=4, generator=generator).images[0] - - visualize_results(image, opt_image) - - -.. parsed-literal:: - - /home/ltalamanova/tmp_venv/lib/python3.11/site-packages/diffusers/configuration_utils.py:140: FutureWarning: Accessing config attribute `_execution_device` directly via 'PixArtAlphaPipeline' object attribute is deprecated. Please access '_execution_device' over 'PixArtAlphaPipeline's config object instead, e.g. 'scheduler.config._execution_device'. - deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) - - - -.. parsed-literal:: - - 0%| | 0/4 [00:00`__ and `model -card `__ - -In this tutorial we consider how to convert, optimize and run this model -using OpenVINO. - -.. warning:: - - Important note: Please take into account that pixtral is large model. - Its conversion requires at least 50GB disk space available - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Convert and Optimize model <#convert-and-optimize-model>`__ -- `Run model inference <#run-model-inference>`__ - - - `Select inference device <#select-inference-device>`__ - - `Initialize inference pipeline <#initialize-inference-pipeline>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "torch>=2.1" torchvision "pillow" "tqdm" "gradio>=4.36" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "nncf>=2.13.0" "openvino>=2024.4" - %pip install -q "transformers>=4.45.0" --extra-index-url https://download.pytorch.org/whl/cpu - -.. code:: ipython3 - - from pathlib import Path - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/refs/heads/latest/notebooks/pixtral/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("pixtral.ipynb") - -Convert and Optimize model --------------------------- - - - -For convenience, we will use OpenVINO integration with HuggingFace -Optimum. `Optimum -Intel `__ is the -interface between the Transformers and Diffusers libraries and the -different tools and libraries provided by Intel to accelerate end-to-end -pipelines on Intel architectures. - -Among other use cases, Optimum Intel provides a simple interface to -optimize your Transformers and Diffusers models, convert them to the -OpenVINO Intermediate Representation (IR) format and run inference using -OpenVINO Runtime. ``optimum-cli`` provides command line interface for -model conversion and optimization. 
- -General command format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where task is task to export the model for, if not specified, the task -will be auto-inferred based on the model. You can find a mapping between -tasks and model classes in Optimum TaskManager -`documentation `__. -Additionally, you can specify weights compression using -``--weight-format`` argument with one of following options: ``fp32``, -``fp16``, ``int8`` and ``int4``. For int8 and int4 -`nncf `__ will be used for -weight compression. More details about model export provided in `Optimum -Intel -documentation `__. - -.. code:: ipython3 - - import ipywidgets as widgets - - model_base_dir = Path("pixtral-12b") - - precisions = ["FP16", "INT8", "INT4"] - - precision_selector = widgets.Dropdown(description="compression", options=precisions, value=precisions[-1]) - - precision_selector - - - - -.. parsed-literal:: - - Dropdown(description='compression', index=2, options=('FP16', 'INT8', 'INT4'), value='INT4') - - - -.. code:: ipython3 - - model_dir = model_base_dir / precision_selector.value - - if not (model_dir / "openvino_language_model.xml").exists(): - !optimum-cli export openvino -m "mistral-community/pixtral-12b" --weight-format {precision_selector.value.lower()} {model_dir} - -Run model inference -------------------- - - - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(default="CPU", exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -Initialize inference pipeline -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -OpenVINO integration with Optimum Intel provides ready-to-use API for -model inference that can be used for smooth integration with -transformers-based solutions. For loading pixtral model, we will use -``OVModelForVisualCausalLM`` class that have compatible interface with -Transformers Pixtral implementation. For loading a model, -``from_pretrained`` method should be used. It accepts path to the model -directory or model_id from HuggingFace hub (if model is not converted to -OpenVINO format, conversion will be triggered automatically). -Additionally, we can provide an inference device, quantization config -(if model has not been quantized yet) and device-specific OpenVINO -Runtime configuration. More details about model inference with Optimum -Intel can be found in -`documentation `__. - -.. code:: ipython3 - - from transformers import AutoProcessor - from optimum.intel.openvino import OVModelForVisualCausalLM - - processor = AutoProcessor.from_pretrained(model_dir) - ov_model = OVModelForVisualCausalLM.from_pretrained(model_dir, device=device.value) - - -.. parsed-literal:: - - 2024-10-02 18:03:58.850094: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-10-02 18:03:58.851883: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used. - 2024-10-02 18:03:58.888025: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. 
- To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-10-02 18:03:59.657376: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - Compiling the Language model to CPU ... - Compiling the Text embeddings model to CPU ... - Compiling the vision_embeddings to CPU ... - - -.. code:: ipython3 - - from PIL import Image - from transformers import TextStreamer - from gradio_helper import chat_template, resize_with_aspect_ratio - - if processor.chat_template is None: - processor.set_chat_template(chat_template) - - question = "What is unusual on this image?" - - messages = [ - {"role": "user", "content": [{"type": "text", "content": question}, {"type": "image"}]}, - ] - text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) - image_path = Path("cat.png") - - if not image_path.exists(): - url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11" - raw_image = Image.open(requests.get(url, stream=True).raw) - raw_image.save("cat.png") - else: - raw_image = Image.open(image_path) - - inputs = processor(text=text, images=[resize_with_aspect_ratio(raw_image)], return_tensors="pt") - streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True) - print(f"Question: {question}") - display(raw_image) - output = ov_model.generate(**inputs, do_sample=False, max_new_tokens=100, temperature=None, top_p=None, streamer=streamer) - - -.. parsed-literal:: - - Question: What is unusual on this image? - - - -.. image:: pixtral-with-output_files/pixtral-with-output_12_1.png - - -.. parsed-literal:: - - Setting `pad_token_id` to `eos_token_id`:None for open-end generation. - - -.. parsed-literal:: - - The unusual aspect of this image is the presence of a cat and a dog lying together peacefully inside a cardboard box. This is not a common sight, as cats and dogs are often perceived as being natural enemies or at least not inclined to share spaces closely. The image portrays a harmonious and playful interaction between the two animals, which challenges typical stereotypes about their relationship. - - -Interactive demo ----------------- - - - -.. 
code:: ipython3 - - from gradio_helper import make_demo - - demo = make_demo(ov_model, processor) - - try: - demo.launch(debug=True) - except Exception: - demo.launch(debug=True, share=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/pixtral-with-output_files/pixtral-with-output_12_1.jpg b/docs/notebooks/pixtral-with-output_files/pixtral-with-output_12_1.jpg deleted file mode 100644 index c6aeec77cd3cb2..00000000000000 --- a/docs/notebooks/pixtral-with-output_files/pixtral-with-output_12_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5fc0d22d75f23474fb4f8aec8c0bf0fdf5d9377f3379e82a3887003e6da47e7e -size 60425 diff --git a/docs/notebooks/pixtral-with-output_files/pixtral-with-output_12_1.png b/docs/notebooks/pixtral-with-output_files/pixtral-with-output_12_1.png deleted file mode 100644 index c6673a757ab5dc..00000000000000 --- a/docs/notebooks/pixtral-with-output_files/pixtral-with-output_12_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c715d8adee4bf7519690de20b57ef2edaa2f914c86a64d107f99a919dcdad218 -size 854224 diff --git a/docs/notebooks/pose-estimation-with-output.rst b/docs/notebooks/pose-estimation-with-output.rst deleted file mode 100644 index 0c3a098c553679..00000000000000 --- a/docs/notebooks/pose-estimation-with-output.rst +++ /dev/null @@ -1,827 +0,0 @@ -Live Human Pose Estimation with OpenVINO™ -========================================= - -This notebook demonstrates live pose estimation with OpenVINO, using the -OpenPose -`human-pose-estimation-0001 `__ -model from `Open Model -Zoo `__. Final part -of this notebook shows live inference results from a webcam. -Additionally, you can also upload a video file. - - **NOTE**: To use a webcam, you must run this Jupyter notebook on a - computer with a webcam. If you run on a server, the webcam will not - work. However, you can still do inference on a video in the final - step. - - -**Table of contents:** - - -- `Imports <#imports>`__ -- `The model <#the-model>`__ - - - `Download the model <#download-the-model>`__ - - `Load the model <#load-the-model>`__ - -- `Processing <#processing>`__ - - - `OpenPose Decoder <#openpose-decoder>`__ - - `Process Results <#process-results>`__ - - `Draw Pose Overlays <#draw-pose-overlays>`__ - - `Main Processing Function <#main-processing-function>`__ - -- `Run <#run>`__ - - - `Run Live Pose Estimation <#run-live-pose-estimation>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - %pip install -q "openvino>=2023.1.0" opencv-python tqdm - -Imports -------- - - - -.. 
code:: ipython3 - - import collections - import time - from pathlib import Path - - import cv2 - import numpy as np - from IPython import display - from numpy.lib.stride_tricks import as_strided - import openvino as ov - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - import notebook_utils as utils - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("pose-estimation.ipynb") - -The model ---------- - - - -Download the model -~~~~~~~~~~~~~~~~~~ - - - -Use the ``download_file``, a function from the ``notebook_utils`` file. -It automatically creates a directory structure and downloads the -selected model. - -If you want to download another model, replace the name of the model and -precision in the code below. - - **NOTE**: This may require a different pose decoder. - -.. code:: ipython3 - - # A directory where the model will be downloaded. - base_model_dir = Path("model") - - # The name of the model from Open Model Zoo. - model_name = "human-pose-estimation-0001" - # Selected precision (FP32, FP16, FP16-INT8). - precision = "FP16-INT8" - - model_path = base_model_dir / "intel" / model_name / precision / f"{model_name}.xml" - - if not model_path.exists(): - model_url_dir = f"https://storage.openvinotoolkit.org/repositories/open_model_zoo/2022.1/models_bin/3/{model_name}/{precision}/" - utils.download_file(model_url_dir + model_name + ".xml", model_path.name, model_path.parent) - utils.download_file( - model_url_dir + model_name + ".bin", - model_path.with_suffix(".bin").name, - model_path.parent, - ) - -Load the model -~~~~~~~~~~~~~~ - - - -Downloaded models are located in a fixed structure, which indicates a -vendor, the name of the model and a precision. - -Only a few lines of code are required to run the model. First, -initialize OpenVINO Runtime. Then, read the network architecture and -model weights from the ``.bin`` and ``.xml`` files to compile it for the -desired device. Select device from dropdown list for running inference -using OpenVINO. - -.. code:: ipython3 - - device = utils.device_widget() - - device - -.. code:: ipython3 - - import openvino.properties.hint as hints - - - # Initialize OpenVINO Runtime - core = ov.Core() - # Read the network from a file. - model = core.read_model(model_path) - # Let the AUTO device decide where to load the model (you can use CPU, GPU as well). - compiled_model = core.compile_model(model=model, device_name=device.value, config={hints.performance_mode(): hints.PerformanceMode.LATENCY}) - - # Get the input and output names of nodes. - input_layer = compiled_model.input(0) - output_layers = compiled_model.outputs - - # Get the input size. - height, width = list(input_layer.shape)[2:] - -Input layer has the name of the input node and output layers contain -names of output nodes of the network. In the case of OpenPose Model, -there is 1 input and 2 outputs: PAFs and keypoints heatmap. - -.. code:: ipython3 - - input_layer.any_name, [o.any_name for o in output_layers] - -OpenPose Decoder -~~~~~~~~~~~~~~~~ - - - -To transform the raw results from the neural network into pose -estimations, you need OpenPose Decoder. 
It is provided in the `Open -Model -Zoo `__ -and compatible with the ``human-pose-estimation-0001`` model. - -If you choose a model other than ``human-pose-estimation-0001`` you will -need another decoder (for example, ``AssociativeEmbeddingDecoder``), -which is available in the `demos -section `__ -of Open Model Zoo. - -.. code:: ipython3 - - # code from https://github.com/openvinotoolkit/open_model_zoo/blob/9296a3712069e688fe64ea02367466122c8e8a3b/demos/common/python/models/open_pose.py#L135 - class OpenPoseDecoder: - BODY_PARTS_KPT_IDS = ( - (1, 2), - (1, 5), - (2, 3), - (3, 4), - (5, 6), - (6, 7), - (1, 8), - (8, 9), - (9, 10), - (1, 11), - (11, 12), - (12, 13), - (1, 0), - (0, 14), - (14, 16), - (0, 15), - (15, 17), - (2, 16), - (5, 17), - ) - BODY_PARTS_PAF_IDS = ( - 12, - 20, - 14, - 16, - 22, - 24, - 0, - 2, - 4, - 6, - 8, - 10, - 28, - 30, - 34, - 32, - 36, - 18, - 26, - ) - - def __init__( - self, - num_joints=18, - skeleton=BODY_PARTS_KPT_IDS, - paf_indices=BODY_PARTS_PAF_IDS, - max_points=100, - score_threshold=0.1, - min_paf_alignment_score=0.05, - delta=0.5, - ): - self.num_joints = num_joints - self.skeleton = skeleton - self.paf_indices = paf_indices - self.max_points = max_points - self.score_threshold = score_threshold - self.min_paf_alignment_score = min_paf_alignment_score - self.delta = delta - - self.points_per_limb = 10 - self.grid = np.arange(self.points_per_limb, dtype=np.float32).reshape(1, -1, 1) - - def __call__(self, heatmaps, nms_heatmaps, pafs): - batch_size, _, h, w = heatmaps.shape - assert batch_size == 1, "Batch size of 1 only supported" - - keypoints = self.extract_points(heatmaps, nms_heatmaps) - pafs = np.transpose(pafs, (0, 2, 3, 1)) - - if self.delta > 0: - for kpts in keypoints: - kpts[:, :2] += self.delta - np.clip(kpts[:, 0], 0, w - 1, out=kpts[:, 0]) - np.clip(kpts[:, 1], 0, h - 1, out=kpts[:, 1]) - - pose_entries, keypoints = self.group_keypoints(keypoints, pafs, pose_entry_size=self.num_joints + 2) - poses, scores = self.convert_to_coco_format(pose_entries, keypoints) - if len(poses) > 0: - poses = np.asarray(poses, dtype=np.float32) - poses = poses.reshape((poses.shape[0], -1, 3)) - else: - poses = np.empty((0, 17, 3), dtype=np.float32) - scores = np.empty(0, dtype=np.float32) - - return poses, scores - - def extract_points(self, heatmaps, nms_heatmaps): - batch_size, channels_num, h, w = heatmaps.shape - assert batch_size == 1, "Batch size of 1 only supported" - assert channels_num >= self.num_joints - - xs, ys, scores = self.top_k(nms_heatmaps) - masks = scores > self.score_threshold - all_keypoints = [] - keypoint_id = 0 - for k in range(self.num_joints): - # Filter low-score points. - mask = masks[0, k] - x = xs[0, k][mask].ravel() - y = ys[0, k][mask].ravel() - score = scores[0, k][mask].ravel() - n = len(x) - if n == 0: - all_keypoints.append(np.empty((0, 4), dtype=np.float32)) - continue - # Apply quarter offset to improve localization accuracy. - x, y = self.refine(heatmaps[0, k], x, y) - np.clip(x, 0, w - 1, out=x) - np.clip(y, 0, h - 1, out=y) - # Pack resulting points. - keypoints = np.empty((n, 4), dtype=np.float32) - keypoints[:, 0] = x - keypoints[:, 1] = y - keypoints[:, 2] = score - keypoints[:, 3] = np.arange(keypoint_id, keypoint_id + n) - keypoint_id += n - all_keypoints.append(keypoints) - return all_keypoints - - def top_k(self, heatmaps): - N, K, _, W = heatmaps.shape - heatmaps = heatmaps.reshape(N, K, -1) - # Get positions with top scores. 
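            # `argpartition` picks the indices of the `max_points` highest scores per
            # keypoint channel without fully sorting all H*W heatmap values; only this
            # small subset is sorted below, which keeps decoding fast.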
- ind = heatmaps.argpartition(-self.max_points, axis=2)[:, :, -self.max_points :] - scores = np.take_along_axis(heatmaps, ind, axis=2) - # Keep top scores sorted. - subind = np.argsort(-scores, axis=2) - ind = np.take_along_axis(ind, subind, axis=2) - scores = np.take_along_axis(scores, subind, axis=2) - y, x = np.divmod(ind, W) - return x, y, scores - - @staticmethod - def refine(heatmap, x, y): - h, w = heatmap.shape[-2:] - valid = np.logical_and(np.logical_and(x > 0, x < w - 1), np.logical_and(y > 0, y < h - 1)) - xx = x[valid] - yy = y[valid] - dx = np.sign(heatmap[yy, xx + 1] - heatmap[yy, xx - 1], dtype=np.float32) * 0.25 - dy = np.sign(heatmap[yy + 1, xx] - heatmap[yy - 1, xx], dtype=np.float32) * 0.25 - x = x.astype(np.float32) - y = y.astype(np.float32) - x[valid] += dx - y[valid] += dy - return x, y - - @staticmethod - def is_disjoint(pose_a, pose_b): - pose_a = pose_a[:-2] - pose_b = pose_b[:-2] - return np.all(np.logical_or.reduce((pose_a == pose_b, pose_a < 0, pose_b < 0))) - - def update_poses( - self, - kpt_a_id, - kpt_b_id, - all_keypoints, - connections, - pose_entries, - pose_entry_size, - ): - for connection in connections: - pose_a_idx = -1 - pose_b_idx = -1 - for j, pose in enumerate(pose_entries): - if pose[kpt_a_id] == connection[0]: - pose_a_idx = j - if pose[kpt_b_id] == connection[1]: - pose_b_idx = j - if pose_a_idx < 0 and pose_b_idx < 0: - # Create new pose entry. - pose_entry = np.full(pose_entry_size, -1, dtype=np.float32) - pose_entry[kpt_a_id] = connection[0] - pose_entry[kpt_b_id] = connection[1] - pose_entry[-1] = 2 - pose_entry[-2] = np.sum(all_keypoints[connection[0:2], 2]) + connection[2] - pose_entries.append(pose_entry) - elif pose_a_idx >= 0 and pose_b_idx >= 0 and pose_a_idx != pose_b_idx: - # Merge two poses are disjoint merge them, otherwise ignore connection. - pose_a = pose_entries[pose_a_idx] - pose_b = pose_entries[pose_b_idx] - if self.is_disjoint(pose_a, pose_b): - pose_a += pose_b - pose_a[:-2] += 1 - pose_a[-2] += connection[2] - del pose_entries[pose_b_idx] - elif pose_a_idx >= 0 and pose_b_idx >= 0: - # Adjust score of a pose. - pose_entries[pose_a_idx][-2] += connection[2] - elif pose_a_idx >= 0: - # Add a new limb into pose. - pose = pose_entries[pose_a_idx] - if pose[kpt_b_id] < 0: - pose[-2] += all_keypoints[connection[1], 2] - pose[kpt_b_id] = connection[1] - pose[-2] += connection[2] - pose[-1] += 1 - elif pose_b_idx >= 0: - # Add a new limb into pose. - pose = pose_entries[pose_b_idx] - if pose[kpt_a_id] < 0: - pose[-2] += all_keypoints[connection[0], 2] - pose[kpt_a_id] = connection[0] - pose[-2] += connection[2] - pose[-1] += 1 - return pose_entries - - @staticmethod - def connections_nms(a_idx, b_idx, affinity_scores): - # From all retrieved connections that share starting/ending keypoints leave only the top-scoring ones. - order = affinity_scores.argsort()[::-1] - affinity_scores = affinity_scores[order] - a_idx = a_idx[order] - b_idx = b_idx[order] - idx = [] - has_kpt_a = set() - has_kpt_b = set() - for t, (i, j) in enumerate(zip(a_idx, b_idx)): - if i not in has_kpt_a and j not in has_kpt_b: - idx.append(t) - has_kpt_a.add(i) - has_kpt_b.add(j) - idx = np.asarray(idx, dtype=np.int32) - return a_idx[idx], b_idx[idx], affinity_scores[idx] - - def group_keypoints(self, all_keypoints_by_type, pafs, pose_entry_size=20): - all_keypoints = np.concatenate(all_keypoints_by_type, axis=0) - pose_entries = [] - # For every limb. 
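            # Each entry of `self.paf_indices` selects the PAF channel pair for one skeleton
            # edge. Candidate endpoint pairs are scored by sampling the part affinity field
            # along the segment between them; well-aligned, high-scoring pairs become
            # connections that `update_poses` merges into person instances.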
- for part_id, paf_channel in enumerate(self.paf_indices): - kpt_a_id, kpt_b_id = self.skeleton[part_id] - kpts_a = all_keypoints_by_type[kpt_a_id] - kpts_b = all_keypoints_by_type[kpt_b_id] - n = len(kpts_a) - m = len(kpts_b) - if n == 0 or m == 0: - continue - - # Get vectors between all pairs of keypoints, i.e. candidate limb vectors. - a = kpts_a[:, :2] - a = np.broadcast_to(a[None], (m, n, 2)) - b = kpts_b[:, :2] - vec_raw = (b[:, None, :] - a).reshape(-1, 1, 2) - - # Sample points along every candidate limb vector. - steps = 1 / (self.points_per_limb - 1) * vec_raw - points = steps * self.grid + a.reshape(-1, 1, 2) - points = points.round().astype(dtype=np.int32) - x = points[..., 0].ravel() - y = points[..., 1].ravel() - - # Compute affinity score between candidate limb vectors and part affinity field. - part_pafs = pafs[0, :, :, paf_channel : paf_channel + 2] - field = part_pafs[y, x].reshape(-1, self.points_per_limb, 2) - vec_norm = np.linalg.norm(vec_raw, ord=2, axis=-1, keepdims=True) - vec = vec_raw / (vec_norm + 1e-6) - affinity_scores = (field * vec).sum(-1).reshape(-1, self.points_per_limb) - valid_affinity_scores = affinity_scores > self.min_paf_alignment_score - valid_num = valid_affinity_scores.sum(1) - affinity_scores = (affinity_scores * valid_affinity_scores).sum(1) / (valid_num + 1e-6) - success_ratio = valid_num / self.points_per_limb - - # Get a list of limbs according to the obtained affinity score. - valid_limbs = np.where(np.logical_and(affinity_scores > 0, success_ratio > 0.8))[0] - if len(valid_limbs) == 0: - continue - b_idx, a_idx = np.divmod(valid_limbs, n) - affinity_scores = affinity_scores[valid_limbs] - - # Suppress incompatible connections. - a_idx, b_idx, affinity_scores = self.connections_nms(a_idx, b_idx, affinity_scores) - connections = list( - zip( - kpts_a[a_idx, 3].astype(np.int32), - kpts_b[b_idx, 3].astype(np.int32), - affinity_scores, - ) - ) - if len(connections) == 0: - continue - - # Update poses with new connections. - pose_entries = self.update_poses( - kpt_a_id, - kpt_b_id, - all_keypoints, - connections, - pose_entries, - pose_entry_size, - ) - - # Remove poses with not enough points. - pose_entries = np.asarray(pose_entries, dtype=np.float32).reshape(-1, pose_entry_size) - pose_entries = pose_entries[pose_entries[:, -1] >= 3] - return pose_entries, all_keypoints - - @staticmethod - def convert_to_coco_format(pose_entries, all_keypoints): - num_joints = 17 - coco_keypoints = [] - scores = [] - for pose in pose_entries: - if len(pose) == 0: - continue - keypoints = np.zeros(num_joints * 3) - reorder_map = [0, -1, 6, 8, 10, 5, 7, 9, 12, 14, 16, 11, 13, 15, 2, 1, 4, 3] - person_score = pose[-2] - for keypoint_id, target_id in zip(pose[:-2], reorder_map): - if target_id < 0: - continue - cx, cy, score = 0, 0, 0 # keypoint not found - if keypoint_id != -1: - cx, cy, score = all_keypoints[int(keypoint_id), 0:3] - keypoints[target_id * 3 + 0] = cx - keypoints[target_id * 3 + 1] = cy - keypoints[target_id * 3 + 2] = score - coco_keypoints.append(keypoints) - scores.append(person_score * max(0, (pose[-1] - 1))) # -1 for 'neck' - return np.asarray(coco_keypoints), np.asarray(scores) - -Processing ----------- - - - -.. code:: ipython3 - - decoder = OpenPoseDecoder() - -Process Results -~~~~~~~~~~~~~~~ - - - -A bunch of useful functions to transform results into poses. - -First, pool the heatmap. Since pooling is not available in numpy, use a -simple method to do it directly with numpy. 
Then, use non-maximum -suppression to get the keypoints from the heatmap. After that, decode -poses by using the decoder. Since the input image is bigger than the -network outputs, you need to multiply all pose coordinates by a scaling -factor. - -.. code:: ipython3 - - # 2D pooling in numpy (from: https://stackoverflow.com/a/54966908/1624463) - def pool2d(A, kernel_size, stride, padding, pool_mode="max"): - """ - 2D Pooling - - Parameters: - A: input 2D array - kernel_size: int, the size of the window - stride: int, the stride of the window - padding: int, implicit zero paddings on both sides of the input - pool_mode: string, 'max' or 'avg' - """ - # Padding - A = np.pad(A, padding, mode="constant") - - # Window view of A - output_shape = ( - (A.shape[0] - kernel_size) // stride + 1, - (A.shape[1] - kernel_size) // stride + 1, - ) - kernel_size = (kernel_size, kernel_size) - A_w = as_strided( - A, - shape=output_shape + kernel_size, - strides=(stride * A.strides[0], stride * A.strides[1]) + A.strides, - ) - A_w = A_w.reshape(-1, *kernel_size) - - # Return the result of pooling. - if pool_mode == "max": - return A_w.max(axis=(1, 2)).reshape(output_shape) - elif pool_mode == "avg": - return A_w.mean(axis=(1, 2)).reshape(output_shape) - - - # non maximum suppression - def heatmap_nms(heatmaps, pooled_heatmaps): - return heatmaps * (heatmaps == pooled_heatmaps) - - - # Get poses from results. - def process_results(img, pafs, heatmaps): - # This processing comes from - # https://github.com/openvinotoolkit/open_model_zoo/blob/master/demos/common/python/models/open_pose.py - pooled_heatmaps = np.array([[pool2d(h, kernel_size=3, stride=1, padding=1, pool_mode="max") for h in heatmaps[0]]]) - nms_heatmaps = heatmap_nms(heatmaps, pooled_heatmaps) - - # Decode poses. - poses, scores = decoder(heatmaps, nms_heatmaps, pafs) - output_shape = list(compiled_model.output(index=0).partial_shape) - output_scale = ( - img.shape[1] / output_shape[3].get_length(), - img.shape[0] / output_shape[2].get_length(), - ) - # Multiply coordinates by a scaling factor. - poses[:, :, :2] *= output_scale - return poses, scores - -Draw Pose Overlays -~~~~~~~~~~~~~~~~~~ - - - -Draw pose overlays on the image to visualize estimated poses. Joints are -drawn as circles and limbs are drawn as lines. The code is based on the -`Human Pose Estimation -Demo `__ -from Open Model Zoo. - -.. code:: ipython3 - - colors = ( - (255, 0, 0), - (255, 0, 255), - (170, 0, 255), - (255, 0, 85), - (255, 0, 170), - (85, 255, 0), - (255, 170, 0), - (0, 255, 0), - (255, 255, 0), - (0, 255, 85), - (170, 255, 0), - (0, 85, 255), - (0, 255, 170), - (0, 0, 255), - (0, 255, 255), - (85, 0, 255), - (0, 170, 255), - ) - - default_skeleton = ( - (15, 13), - (13, 11), - (16, 14), - (14, 12), - (11, 12), - (5, 11), - (6, 12), - (5, 6), - (5, 7), - (6, 8), - (7, 9), - (8, 10), - (1, 2), - (0, 1), - (0, 2), - (1, 3), - (2, 4), - (3, 5), - (4, 6), - ) - - - def draw_poses(img, poses, point_score_threshold, skeleton=default_skeleton): - if poses.size == 0: - return img - - img_limbs = np.copy(img) - for pose in poses: - points = pose[:, :2].astype(np.int32) - points_scores = pose[:, 2] - # Draw joints. - for i, (p, v) in enumerate(zip(points, points_scores)): - if v > point_score_threshold: - cv2.circle(img, tuple(p), 1, colors[i], 2) - # Draw limbs. 
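-            # Limbs are drawn on the separate copy (img_limbs) so that the
-            # cv2.addWeighted call below can blend them semi-transparently over
-            # the joints drawn directly on the frame.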
- for i, j in skeleton: - if points_scores[i] > point_score_threshold and points_scores[j] > point_score_threshold: - cv2.line( - img_limbs, - tuple(points[i]), - tuple(points[j]), - color=colors[j], - thickness=4, - ) - cv2.addWeighted(img, 0.4, img_limbs, 0.6, 0, dst=img) - return img - -Main Processing Function -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Run pose estimation on the specified source. Either a webcam or a video -file. - -.. code:: ipython3 - - # Main processing function to run pose estimation. - def run_pose_estimation(source=0, flip=False, use_popup=False, skip_first_frames=0): - pafs_output_key = compiled_model.output("Mconv7_stage2_L1") - heatmaps_output_key = compiled_model.output("Mconv7_stage2_L2") - player = None - try: - # Create a video player to play with target fps. - player = utils.VideoPlayer(source, flip=flip, fps=30, skip_first_frames=skip_first_frames) - # Start capturing. - player.start() - if use_popup: - title = "Press ESC to Exit" - cv2.namedWindow(title, cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE) - - processing_times = collections.deque() - - while True: - # Grab the frame. - frame = player.next() - if frame is None: - print("Source ended") - break - # If the frame is larger than full HD, reduce size to improve the performance. - scale = 1280 / max(frame.shape) - if scale < 1: - frame = cv2.resize(frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA) - - # Resize the image and change dims to fit neural network input. - # (see https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/intel/human-pose-estimation-0001) - input_img = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA) - # Create a batch of images (size = 1). - input_img = input_img.transpose((2, 0, 1))[np.newaxis, ...] - - # Measure processing time. - start_time = time.time() - # Get results. - results = compiled_model([input_img]) - stop_time = time.time() - - pafs = results[pafs_output_key] - heatmaps = results[heatmaps_output_key] - # Get poses from network results. - poses, scores = process_results(frame, pafs, heatmaps) - - # Draw poses on a frame. - frame = draw_poses(frame, poses, 0.1) - - processing_times.append(stop_time - start_time) - # Use processing times from last 200 frames. - if len(processing_times) > 200: - processing_times.popleft() - - _, f_width = frame.shape[:2] - # mean processing time [ms] - processing_time = np.mean(processing_times) * 1000 - fps = 1000 / processing_time - cv2.putText( - frame, - f"Inference time: {processing_time:.1f}ms ({fps:.1f} FPS)", - (20, 40), - cv2.FONT_HERSHEY_COMPLEX, - f_width / 1000, - (0, 0, 255), - 1, - cv2.LINE_AA, - ) - - # Use this workaround if there is flickering. - if use_popup: - cv2.imshow(title, frame) - key = cv2.waitKey(1) - # escape = 27 - if key == 27: - break - else: - # Encode numpy array to jpg. - _, encoded_img = cv2.imencode(".jpg", frame, params=[cv2.IMWRITE_JPEG_QUALITY, 90]) - # Create an IPython image. - i = display.Image(data=encoded_img) - # Display the image in this notebook. - display.clear_output(wait=True) - display.display(i) - # ctrl-c - except KeyboardInterrupt: - print("Interrupted") - # any different error - except RuntimeError as e: - print(e) - finally: - if player is not None: - # Stop capturing. - player.stop() - if use_popup: - cv2.destroyAllWindows() - -Run ---- - - - -Run Live Pose Estimation -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Use a webcam as the video input. By default, the primary webcam is set -with ``source=0``. 
If you have multiple webcams, each one will be -assigned a consecutive number starting at 0. Set ``flip=True`` when -using a front-facing camera. Some web browsers, especially Mozilla -Firefox, may cause flickering. If you experience flickering, set -``use_popup=True``. - - **NOTE**: To use this notebook with a webcam, you need to run the - notebook on a computer with a webcam. If you run the notebook on a - server (for example, Binder), the webcam will not work. Popup mode - may not work if you run this notebook on a remote computer (for - example, Binder). - -If you do not have a webcam, you can still run this demo with a video -file. Any `format supported by -OpenCV `__ -will work. You can skip first ``N`` frames to fast forward video. - -Run the pose estimation: - -.. code:: ipython3 - - USE_WEBCAM = False - cam_id = 0 - video_file = Path("store-aisle-detection.mp4") - video_url = "https://storage.openvinotoolkit.org/data/test_data/videos/store-aisle-detection.mp4" - source = cam_id if USE_WEBCAM else video_file - - if not USE_WEBCAM and not Path(video_file).exists(): - utils.download_file(video_url) - - additional_options = {"skip_first_frames": 500} if not USE_WEBCAM else {} - run_pose_estimation(source=source, flip=isinstance(source, int), use_popup=False, **additional_options) diff --git a/docs/notebooks/pytorch-onnx-to-openvino-with-output.rst b/docs/notebooks/pytorch-onnx-to-openvino-with-output.rst deleted file mode 100644 index cb3882b145e0db..00000000000000 --- a/docs/notebooks/pytorch-onnx-to-openvino-with-output.rst +++ /dev/null @@ -1,624 +0,0 @@ -Convert a PyTorch Model to ONNX and OpenVINO™ IR -================================================ - -This tutorial demonstrates step-by-step instructions on how to do -inference on a PyTorch semantic segmentation model, using OpenVINO -Runtime. - -First, the PyTorch model is exported in `ONNX `__ -format and then converted to OpenVINO IR. Then the respective ONNX and -OpenVINO IR models are loaded into OpenVINO Runtime to show model -predictions. In this tutorial, we will use LR-ASPP model with -MobileNetV3 backbone. - -According to the paper, `Searching for -MobileNetV3 `__, LR-ASPP or Lite -Reduced Atrous Spatial Pyramid Pooling has a lightweight and efficient -segmentation decoder architecture. The diagram below illustrates the -model architecture: - -.. figure:: https://user-images.githubusercontent.com/29454499/207099169-48dca3dc-a8eb-4e11-be92-40cebeec7a88.png - :alt: image - - image - -The model is pre-trained on the `MS -COCO `__ dataset. 
Instead of training on -all 80 classes, the segmentation model has been trained on 20 classes -from the `PASCAL VOC `__ -dataset: **background, aeroplane, bicycle, bird, boat, bottle, bus, car, -cat, chair, cow, dining table, dog, horse, motorbike, person, potted -plant, sheep, sofa, train, tv monitor** - -More information about the model is available in the `torchvision -documentation `__ - - -**Table of contents:** - - -- `Preparation <#preparation>`__ - - - `Imports <#imports>`__ - - `Settings <#settings>`__ - - `Load Model <#load-model>`__ - -- `ONNX Model Conversion <#onnx-model-conversion>`__ - - - `Convert PyTorch model to ONNX <#convert-pytorch-model-to-onnx>`__ - - `Convert ONNX Model to OpenVINO IR - Format <#convert-onnx-model-to-openvino-ir-format>`__ - -- `Show Results <#show-results>`__ - - - `Load and Preprocess an Input - Image <#load-and-preprocess-an-input-image>`__ - - `Load the OpenVINO IR Network and Run Inference on the ONNX - model <#load-the-openvino-ir-network-and-run-inference-on-the-onnx-model>`__ - - - `1. ONNX Model in OpenVINO - Runtime <#1--onnx-model-in-openvino-runtime>`__ - - `Select inference device <#select-inference-device>`__ - - `2. OpenVINO IR Model in OpenVINO - Runtime <#2--openvino-ir-model-in-openvino-runtime>`__ - - `Select inference device <#select-inference-device>`__ - -- `PyTorch Comparison <#pytorch-comparison>`__ -- `Performance Comparison <#performance-comparison>`__ -- `References <#references>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - # Install openvino package - %pip install -q "openvino>=2023.1.0" onnx torch torchvision opencv-python tqdm --extra-index-url https://download.pytorch.org/whl/cpu - -Preparation ------------ - - - -Imports -~~~~~~~ - - - -.. code:: ipython3 - - import time - import warnings - from pathlib import Path - - import cv2 - import numpy as np - import openvino as ov - import torch - from torchvision.models.segmentation import ( - lraspp_mobilenet_v3_large, - LRASPP_MobileNet_V3_Large_Weights, - ) - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import segmentation_map_to_image, viz_result_image, SegmentationMap, Label, download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("pytorch-onnx-to-openvino.ipynb") - -Settings -~~~~~~~~ - - - -Set a name for the model, then define width and height of the image that -will be used by the network during inference. According to the input -transforms function, the model is pre-trained on images with a height of -520 and width of 780. - -.. code:: ipython3 - - IMAGE_WIDTH = 780 - IMAGE_HEIGHT = 520 - DIRECTORY_NAME = "model" - BASE_MODEL_NAME = DIRECTORY_NAME + "/lraspp_mobilenet_v3_large" - weights_path = Path(BASE_MODEL_NAME + ".pt") - - # Paths where ONNX and OpenVINO IR models will be stored. 
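-    # Both paths are derived from the checkpoint location, so the .pt, .onnx
-    # and .xml/.bin files share one base name inside the "model" directory.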
- onnx_path = weights_path.with_suffix(".onnx") - if not onnx_path.parent.exists(): - onnx_path.parent.mkdir() - ir_path = onnx_path.with_suffix(".xml") - -Load Model -~~~~~~~~~~ - - - -Generally, PyTorch models represent an instance of ``torch.nn.Module`` -class, initialized by a state dictionary with model weights. Typical -steps for getting a pre-trained model: 1. Create instance of model class -2. Load checkpoint state dict, which contains pre-trained model weights -3. Turn model to evaluation for switching some operations to inference -mode - -The ``torchvision`` module provides a ready to use set of functions for -model class initialization. We will use -``torchvision.models.segmentation.lraspp_mobilenet_v3_large``. You can -directly pass pre-trained model weights to the model initialization -function using weights enum -``LRASPP_MobileNet_V3_Large_Weights.COCO_WITH_VOC_LABELS_V1``. However, -for demonstration purposes, we will create it separately. Download the -pre-trained weights and load the model. This may take some time if you -have not downloaded the model before. - -.. code:: ipython3 - - if not weights_path.exists(): - print("Downloading the LRASPP MobileNetV3 model (if it has not been downloaded already)...") - download_file( - LRASPP_MobileNet_V3_Large_Weights.COCO_WITH_VOC_LABELS_V1.url, - filename=weights_path.name, - directory=weights_path.parent, - ) - # create model object - model = lraspp_mobilenet_v3_large() - # read state dict, use map_location argument to avoid a situation where weights are saved in cuda (which may not be unavailable on the system) - state_dict = torch.load(weights_path, map_location="cpu") - # load state dict to model - model.load_state_dict(state_dict) - # switch model from training to inference mode - model.eval() - print("Loaded PyTorch LRASPP MobileNetV3 model") - - -.. parsed-literal:: - - Downloading the LRASPP MobileNetV3 model (if it has not been downloaded already)... - 'model/lraspp_mobilenet_v3_large.pt' already exists. - Loaded PyTorch LRASPP MobileNetV3 model - - -ONNX Model Conversion ---------------------- - - - -Convert PyTorch model to ONNX -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -OpenVINO supports PyTorch models that are exported in ONNX format. We -will use the ``torch.onnx.export`` function to obtain the ONNX model, -you can learn more about this feature in the `PyTorch -documentation `__. We need to -provide a model object, example input for model tracing and path where -the model will be saved. When providing example input, it is not -necessary to use real data, dummy input data with specified shape is -sufficient. Optionally, we can provide a target onnx opset for -conversion and/or other parameters specified in documentation -(e.g. input and output names or dynamic shapes). - -Sometimes a warning will be shown, but in most cases it is harmless, so -let us just filter it out. When the conversion is successful, the last -line of the output will read: -``ONNX model exported to model/lraspp_mobilenet_v3_large.onnx.`` - -.. code:: ipython3 - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore") - if not onnx_path.exists(): - dummy_input = torch.randn(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH) - torch.onnx.export( - model, - dummy_input, - onnx_path, - ) - print(f"ONNX model exported to {onnx_path}.") - else: - print(f"ONNX model {onnx_path} already exists.") - - -.. 
parsed-literal:: - - ============= Diagnostic Run torch.onnx.export version 2.0.1+cu117 ============= - verbose: False, log level: Level.ERROR - ======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ======================== - - ONNX model exported to model/lraspp_mobilenet_v3_large.onnx. - - -Convert ONNX Model to OpenVINO IR Format -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To convert the ONNX model to OpenVINO IR with ``FP16`` precision, use -model conversion API. The models are saved inside the current directory. -For more information on how to convert models, see this -`page `__. - -.. code:: ipython3 - - if not ir_path.exists(): - print("Exporting ONNX model to IR... This may take a few minutes.") - ov_model = ov.convert_model(onnx_path) - ov.save_model(ov_model, ir_path) - else: - print(f"IR model {ir_path} already exists.") - - -.. parsed-literal:: - - Exporting ONNX model to IR... This may take a few minutes. - - -Show Results ------------- - - - -Confirm that the segmentation results look as expected by comparing -model predictions on the ONNX, OpenVINO IR and PyTorch models. - -Load and Preprocess an Input Image -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Images need to be normalized before propagating through the network. - -.. code:: ipython3 - - def normalize(image: np.ndarray) -> np.ndarray: - """ - Normalize the image to the given mean and standard deviation - for CityScapes models. - """ - image = image.astype(np.float32) - mean = (0.485, 0.456, 0.406) - std = (0.229, 0.224, 0.225) - image /= 255.0 - image -= mean - image /= std - return image - -.. code:: ipython3 - - # Download the image from the openvino_notebooks storage - image_filename = Path("data") / "coco.jpg" - - if not image_filename.exists(): - download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco.jpg", - directory="data", - ) - - image = cv2.cvtColor(cv2.imread(str(image_filename)), cv2.COLOR_BGR2RGB) - - resized_image = cv2.resize(image, (IMAGE_WIDTH, IMAGE_HEIGHT)) - normalized_image = normalize(resized_image) - - # Convert the resized images to network input shape. - input_image = np.expand_dims(np.transpose(resized_image, (2, 0, 1)), 0) - normalized_input_image = np.expand_dims(np.transpose(normalized_image, (2, 0, 1)), 0) - -Load the OpenVINO IR Network and Run Inference on the ONNX model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -OpenVINO Runtime can load ONNX models directly. First, load the ONNX -model, do inference and show the results. Then, load the model that was -converted to OpenVINO Intermediate Representation (OpenVINO IR) with -OpenVINO Converter and do inference on that model, and show the results -on an image. - -1. ONNX Model in OpenVINO Runtime -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -.. code:: ipython3 - - # Instantiate OpenVINO Core - core = ov.Core() - - # Read model to OpenVINO Runtime - model_onnx = core.read_model(model=onnx_path) - -Select inference device -^^^^^^^^^^^^^^^^^^^^^^^ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. 
code:: ipython3 - - # Load model on device - compiled_model_onnx = core.compile_model(model=model_onnx, device_name=device.value) - - # Run inference on the input image - res_onnx = compiled_model_onnx([normalized_input_image])[0] - -Model predicts probabilities for how well each pixel corresponds to a -specific label. To get the label with highest probability for each -pixel, operation argmax should be applied. After that, color coding can -be applied to each label for more convenient visualization. - -.. code:: ipython3 - - voc_labels = [ - Label(index=0, color=(0, 0, 0), name="background"), - Label(index=1, color=(128, 0, 0), name="aeroplane"), - Label(index=2, color=(0, 128, 0), name="bicycle"), - Label(index=3, color=(128, 128, 0), name="bird"), - Label(index=4, color=(0, 0, 128), name="boat"), - Label(index=5, color=(128, 0, 128), name="bottle"), - Label(index=6, color=(0, 128, 128), name="bus"), - Label(index=7, color=(128, 128, 128), name="car"), - Label(index=8, color=(64, 0, 0), name="cat"), - Label(index=9, color=(192, 0, 0), name="chair"), - Label(index=10, color=(64, 128, 0), name="cow"), - Label(index=11, color=(192, 128, 0), name="dining table"), - Label(index=12, color=(64, 0, 128), name="dog"), - Label(index=13, color=(192, 0, 128), name="horse"), - Label(index=14, color=(64, 128, 128), name="motorbike"), - Label(index=15, color=(192, 128, 128), name="person"), - Label(index=16, color=(0, 64, 0), name="potted plant"), - Label(index=17, color=(128, 64, 0), name="sheep"), - Label(index=18, color=(0, 192, 0), name="sofa"), - Label(index=19, color=(128, 192, 0), name="train"), - Label(index=20, color=(0, 64, 128), name="tv monitor"), - ] - VOCLabels = SegmentationMap(voc_labels) - - # Convert the network result to a segmentation map and display the result. - result_mask_onnx = np.squeeze(np.argmax(res_onnx, axis=1)).astype(np.uint8) - viz_result_image( - image, - segmentation_map_to_image(result_mask_onnx, VOCLabels.get_colormap()), - resize=True, - ) - - - - -.. image:: pytorch-onnx-to-openvino-with-output_files/pytorch-onnx-to-openvino-with-output_22_0.png - - - -2. OpenVINO IR Model in OpenVINO Runtime -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -Select inference device -^^^^^^^^^^^^^^^^^^^^^^^ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - # Load the network in OpenVINO Runtime. - core = ov.Core() - model_ir = core.read_model(model=ir_path) - compiled_model_ir = core.compile_model(model=model_ir, device_name=device.value) - - # Get input and output layers. - output_layer_ir = compiled_model_ir.output(0) - - # Run inference on the input image. - res_ir = compiled_model_ir([normalized_input_image])[output_layer_ir] - -.. code:: ipython3 - - result_mask_ir = np.squeeze(np.argmax(res_ir, axis=1)).astype(np.uint8) - viz_result_image( - image, - segmentation_map_to_image(result=result_mask_ir, colormap=VOCLabels.get_colormap()), - resize=True, - ) - - - - -.. image:: pytorch-onnx-to-openvino-with-output_files/pytorch-onnx-to-openvino-with-output_27_0.png - - - -PyTorch Comparison ------------------- - - - -Do inference on the PyTorch model to verify that the output visually -looks the same as the output on the ONNX/OpenVINO IR models. - -.. 
code:: ipython3 - - model.eval() - with torch.no_grad(): - result_torch = model(torch.as_tensor(normalized_input_image).float()) - - result_mask_torch = torch.argmax(result_torch["out"], dim=1).squeeze(0).numpy().astype(np.uint8) - viz_result_image( - image, - segmentation_map_to_image(result=result_mask_torch, colormap=VOCLabels.get_colormap()), - resize=True, - ) - - - - -.. image:: pytorch-onnx-to-openvino-with-output_files/pytorch-onnx-to-openvino-with-output_29_0.png - - - -Performance Comparison ----------------------- - - - -Measure the time it takes to do inference on twenty images. This gives -an indication of performance. For more accurate benchmarking, use the -`Benchmark -Tool `__. -Keep in mind that many optimizations are possible to improve the -performance. - -.. code:: ipython3 - - num_images = 100 - - with torch.no_grad(): - start = time.perf_counter() - for _ in range(num_images): - model(torch.as_tensor(input_image).float()) - end = time.perf_counter() - time_torch = end - start - print(f"PyTorch model on CPU: {time_torch/num_images:.3f} seconds per image, " f"FPS: {num_images/time_torch:.2f}") - - compiled_model_onnx = core.compile_model(model=model_onnx, device_name=device.value) - start = time.perf_counter() - for _ in range(num_images): - compiled_model_onnx([normalized_input_image]) - end = time.perf_counter() - time_onnx = end - start - print(f"ONNX model in OpenVINO Runtime/{device.value}: {time_onnx/num_images:.3f} " f"seconds per image, FPS: {num_images/time_onnx:.2f}") - - compiled_model_ir = core.compile_model(model=model_ir, device_name=device.value) - start = time.perf_counter() - for _ in range(num_images): - compiled_model_ir([input_image]) - end = time.perf_counter() - time_ir = end - start - print(f"OpenVINO IR model in OpenVINO Runtime/{device.value}: {time_ir/num_images:.3f} " f"seconds per image, FPS: {num_images/time_ir:.2f}") - - -.. parsed-literal:: - - PyTorch model on CPU: 0.038 seconds per image, FPS: 26.50 - ONNX model in OpenVINO Runtime/CPU: 0.026 seconds per image, FPS: 38.21 - OpenVINO IR model in OpenVINO Runtime/CPU: 0.026 seconds per image, FPS: 38.87 - - -.. parsed-literal:: - - 75 warnings generated. - 75 warnings generated. - 75 warnings generated. - 75 warnings generated. - 75 warnings generated. - 75 warnings generated. - 75 warnings generated. - 75 warnings generated. - - -.. parsed-literal:: - - ONNX model in OpenVINO/GPU: 0.035 seconds per image, FPS: 28.38 - - -.. parsed-literal:: - - 75 warnings generated. - 75 warnings generated. - 75 warnings generated. - 75 warnings generated. - 75 warnings generated. - 75 warnings generated. - 75 warnings generated. - 75 warnings generated. - - -.. parsed-literal:: - - IR model in OpenVINO/GPU: 0.035 seconds per image, FPS: 28.22 - - -**Show Device Information** - -.. code:: ipython3 - - import openvino.properties as props - - - devices = core.available_devices - for device in devices: - device_name = core.get_property(device, props.device.full_name) - print(f"{device}: {device_name}") - - -.. 
parsed-literal:: - - CPU: Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz - GPU: NVIDIA GeForce RTX 3090 (dGPU) - - -References ----------- - - - -- `Torchvision `__ -- `Pytorch ONNX - Documentation `__ -- `PIP install openvino `__ -- `OpenVINO ONNX - support `__ -- `Model Conversion API - documentation `__ -- `Converting Pytorch - model `__ diff --git a/docs/notebooks/pytorch-onnx-to-openvino-with-output_files/pytorch-onnx-to-openvino-with-output_22_0.png b/docs/notebooks/pytorch-onnx-to-openvino-with-output_files/pytorch-onnx-to-openvino-with-output_22_0.png deleted file mode 100644 index 945e25fb7ae372..00000000000000 --- a/docs/notebooks/pytorch-onnx-to-openvino-with-output_files/pytorch-onnx-to-openvino-with-output_22_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c48d5afdee0d49838332deac5fea09e48ea455371a5bc88fd865b070fcffbc3 -size 465692 diff --git a/docs/notebooks/pytorch-onnx-to-openvino-with-output_files/pytorch-onnx-to-openvino-with-output_27_0.png b/docs/notebooks/pytorch-onnx-to-openvino-with-output_files/pytorch-onnx-to-openvino-with-output_27_0.png deleted file mode 100644 index 15e166b09bbceb..00000000000000 --- a/docs/notebooks/pytorch-onnx-to-openvino-with-output_files/pytorch-onnx-to-openvino-with-output_27_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eb50dbc3d8a370e69c72cedea217cb413f04852f64580e0c7b54aa495b6c5ad2 -size 465695 diff --git a/docs/notebooks/pytorch-onnx-to-openvino-with-output_files/pytorch-onnx-to-openvino-with-output_29_0.png b/docs/notebooks/pytorch-onnx-to-openvino-with-output_files/pytorch-onnx-to-openvino-with-output_29_0.png deleted file mode 100644 index 945e25fb7ae372..00000000000000 --- a/docs/notebooks/pytorch-onnx-to-openvino-with-output_files/pytorch-onnx-to-openvino-with-output_29_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c48d5afdee0d49838332deac5fea09e48ea455371a5bc88fd865b070fcffbc3 -size 465692 diff --git a/docs/notebooks/pytorch-post-training-quantization-nncf-with-output.rst b/docs/notebooks/pytorch-post-training-quantization-nncf-with-output.rst deleted file mode 100644 index 4aa0095ed66af1..00000000000000 --- a/docs/notebooks/pytorch-post-training-quantization-nncf-with-output.rst +++ /dev/null @@ -1,740 +0,0 @@ -Post-Training Quantization of PyTorch models with NNCF -====================================================== - -The goal of this tutorial is to demonstrate how to use the NNCF (Neural -Network Compression Framework) 8-bit quantization in post-training mode -(without the fine-tuning pipeline) to optimize a PyTorch model for the -high-speed inference via OpenVINO™ Toolkit. The optimization process -contains the following steps: - -1. Evaluate the original model. -2. Transform the original model to a quantized one. -3. Export optimized and original models to OpenVINO IR. -4. Compare performance of the obtained ``FP32`` and ``INT8`` models. - -This tutorial uses a ResNet-50 model, pre-trained on Tiny ImageNet, -which contains 100000 images of 200 classes (500 for each class) -downsized to 64×64 colored images. The tutorial will demonstrate that -only a tiny part of the dataset is needed for the post-training -quantization, not demanding the fine-tuning of the model. - - **NOTE**: This notebook requires that a C++ compiler is accessible on - the default binary search path of the OS you are running the - notebook. 
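-
-In outline, the whole flow comes down to a few NNCF and OpenVINO calls. The
-sketch below is only an illustration (``calibration_loader``, ``model`` and
-``example_input`` are placeholders); the sections that follow define the real
-dataset, validation pipeline, and file paths.
-
-.. code:: ipython3
-
-    import nncf
-    import openvino as ov
-
-    # Wrap an existing torch DataLoader; the transform extracts the input
-    # tensor from each (images, labels) batch.
-    calibration_dataset = nncf.Dataset(calibration_loader, lambda batch: batch[0])
-
-    # Insert quantizers into the model and initialize their parameters from the
-    # calibration data.
-    quantized_model = nncf.quantize(model, calibration_dataset)
-
-    # Convert both models to OpenVINO IR so they can be benchmarked later.
-    ov.save_model(ov.convert_model(model, example_input=example_input), "output/model_fp32.xml")
-    ov.save_model(ov.convert_model(quantized_model, example_input=example_input), "output/model_int8.xml")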
- - -**Table of contents:** - - -- `Preparations <#preparations>`__ - - - `Imports <#imports>`__ - - `Settings <#settings>`__ - - `Download and Prepare Tiny ImageNet - dataset <#download-and-prepare-tiny-imagenet-dataset>`__ - - `Helpers classes and functions <#helpers-classes-and-functions>`__ - - `Validation function <#validation-function>`__ - - `Create and load original uncompressed - model <#create-and-load-original-uncompressed-model>`__ - - `Create train and validation - DataLoaders <#create-train-and-validation-dataloaders>`__ - -- `Model quantization and - benchmarking <#model-quantization-and-benchmarking>`__ - - - `I. Evaluate the loaded model <#i--evaluate-the-loaded-model>`__ - - `II. Create and initialize - quantization <#ii--create-and-initialize-quantization>`__ - - `III. Convert the models to OpenVINO Intermediate Representation - (OpenVINO - IR) <#iii--convert-the-models-to-openvino-intermediate-representation-openvino-ir>`__ - - `IV. Compare performance of INT8 model and FP32 model in - OpenVINO <#iv--compare-performance-of-int8-model-and-fp32-model-in-openvino>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Preparations ------------- - - - -.. code:: ipython3 - - # Install openvino package - %pip install -q "openvino>=2024.0.0" torch torchvision tqdm --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "nncf>=2.9.0" - -Imports -~~~~~~~ - - - -.. code:: ipython3 - - import os - import time - import zipfile - from pathlib import Path - from typing import List, Tuple - - import nncf - import openvino as ov - - import torch - from torchvision.datasets import ImageFolder - from torchvision.models import resnet50 - import torchvision.transforms as transforms - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("pytorch-post-training-quantization-nncf.ipynb") - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -Settings -~~~~~~~~ - - - -.. code:: ipython3 - - torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print(f"Using {torch_device} device") - - MODEL_DIR = Path("model") - OUTPUT_DIR = Path("output") - BASE_MODEL_NAME = "resnet50" - IMAGE_SIZE = [64, 64] - - OUTPUT_DIR.mkdir(exist_ok=True) - MODEL_DIR.mkdir(exist_ok=True) - - # Paths where PyTorch and OpenVINO IR models will be stored. 
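-    # The PyTorch checkpoint stays in MODEL_DIR, while both IR files go to
-    # OUTPUT_DIR so the FP32 and INT8 models can later be benchmarked side by side.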
- fp32_checkpoint_filename = Path(BASE_MODEL_NAME + "_fp32").with_suffix(".pth") - fp32_ir_path = OUTPUT_DIR / Path(BASE_MODEL_NAME + "_fp32").with_suffix(".xml") - int8_ir_path = OUTPUT_DIR / Path(BASE_MODEL_NAME + "_int8").with_suffix(".xml") - - if not (MODEL_DIR / fp32_checkpoint_filename).exists(): - fp32_pth_url = "https://storage.openvinotoolkit.org/repositories/nncf/openvino_notebook_ckpts/304_resnet50_fp32.pth" - download_file(fp32_pth_url, directory=MODEL_DIR, filename=fp32_checkpoint_filename) - - -.. parsed-literal:: - - Using cpu device - 'model/resnet50_fp32.pth' already exists. - - - - -.. parsed-literal:: - - PosixPath('/home/ea/work/openvino_notebooks/notebooks/pytorch-post-training-quantization-nncf/model/resnet50_fp32.pth') - - - -Download and Prepare Tiny ImageNet dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -- 100k images of shape 3x64x64, -- 200 different classes: snake, spider, cat, truck, grasshopper, gull, - etc. - -.. code:: ipython3 - - def download_tiny_imagenet_200( - output_dir: Path, - url: str = "http://cs231n.stanford.edu/tiny-imagenet-200.zip", - tarname: str = "tiny-imagenet-200.zip", - ): - archive_path = output_dir / tarname - download_file(url, directory=output_dir, filename=tarname) - zip_ref = zipfile.ZipFile(archive_path, "r") - zip_ref.extractall(path=output_dir) - zip_ref.close() - print(f"Successfully downloaded and extracted dataset to: {output_dir}") - - - def create_validation_dir(dataset_dir: Path): - VALID_DIR = dataset_dir / "val" - val_img_dir = VALID_DIR / "images" - - fp = open(VALID_DIR / "val_annotations.txt", "r") - data = fp.readlines() - - val_img_dict = {} - for line in data: - words = line.split("\t") - val_img_dict[words[0]] = words[1] - fp.close() - - for img, folder in val_img_dict.items(): - newpath = val_img_dir / folder - if not newpath.exists(): - os.makedirs(newpath) - if (val_img_dir / img).exists(): - os.rename(val_img_dir / img, newpath / img) - - - DATASET_DIR = OUTPUT_DIR / "tiny-imagenet-200" - if not DATASET_DIR.exists(): - download_tiny_imagenet_200(OUTPUT_DIR) - create_validation_dir(DATASET_DIR) - -Helpers classes and functions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The code below will help to count accuracy and visualize validation -process. - -.. 
code:: ipython3 - - class AverageMeter(object): - """Computes and stores the average and current value""" - - def __init__(self, name: str, fmt: str = ":f"): - self.name = name - self.fmt = fmt - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val: float, n: int = 1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" - return fmtstr.format(**self.__dict__) - - - class ProgressMeter(object): - """Displays the progress of validation process""" - - def __init__(self, num_batches: int, meters: List[AverageMeter], prefix: str = ""): - self.batch_fmtstr = self._get_batch_fmtstr(num_batches) - self.meters = meters - self.prefix = prefix - - def display(self, batch: int): - entries = [self.prefix + self.batch_fmtstr.format(batch)] - entries += [str(meter) for meter in self.meters] - print("\t".join(entries)) - - def _get_batch_fmtstr(self, num_batches: int): - num_digits = len(str(num_batches // 1)) - fmt = "{:" + str(num_digits) + "d}" - return "[" + fmt + "/" + fmt.format(num_batches) + "]" - - - def accuracy(output: torch.Tensor, target: torch.Tensor, topk: Tuple[int] = (1,)): - """Computes the accuracy over the k top predictions for the specified values of k""" - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - - return res - -Validation function -~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from typing import Union - from openvino.runtime.ie_api import CompiledModel - - - def validate( - val_loader: torch.utils.data.DataLoader, - model: Union[torch.nn.Module, CompiledModel], - ): - """Compute the metrics using data from val_loader for the model""" - batch_time = AverageMeter("Time", ":3.3f") - top1 = AverageMeter("Acc@1", ":2.2f") - top5 = AverageMeter("Acc@5", ":2.2f") - progress = ProgressMeter(len(val_loader), [batch_time, top1, top5], prefix="Test: ") - start_time = time.time() - # Switch to evaluate mode. - if not isinstance(model, CompiledModel): - model.eval() - model.to(torch_device) - - with torch.no_grad(): - end = time.time() - for i, (images, target) in enumerate(val_loader): - images = images.to(torch_device) - target = target.to(torch_device) - - # Compute the output. - if isinstance(model, CompiledModel): - output_layer = model.output(0) - output = model(images)[output_layer] - output = torch.from_numpy(output) - else: - output = model(images) - - # Measure accuracy and record loss. - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - - # Measure elapsed time. 
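-                # batch_time records the wall-clock time of the whole iteration
-                # (data loading, inference and metric computation); ProgressMeter
-                # prints the latest value together with the running average.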
- batch_time.update(time.time() - end) - end = time.time() - - print_frequency = 10 - if i % print_frequency == 0: - progress.display(i) - - print(" * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f} Total time: {total_time:.3f}".format(top1=top1, top5=top5, total_time=end - start_time)) - return top1.avg - -Create and load original uncompressed model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -ResNet-50 from the `torchivision -repository `__ is pre-trained on -ImageNet with more prediction classes than Tiny ImageNet, so the model -is adjusted by swapping the last FC layer to one with fewer output -values. - -.. code:: ipython3 - - def create_model(model_path: Path): - """Creates the ResNet-50 model and loads the pretrained weights""" - model = resnet50() - # Update the last FC layer for Tiny ImageNet number of classes. - NUM_CLASSES = 200 - model.fc = torch.nn.Linear(in_features=2048, out_features=NUM_CLASSES, bias=True) - model.to(torch_device) - if model_path.exists(): - checkpoint = torch.load(str(model_path), map_location="cpu") - model.load_state_dict(checkpoint["state_dict"], strict=True) - else: - raise RuntimeError("There is no checkpoint to load") - return model - - - model = create_model(MODEL_DIR / fp32_checkpoint_filename) - -Create train and validation DataLoaders -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - def create_dataloaders(batch_size: int = 128): - """Creates train dataloader that is used for quantization initialization and validation dataloader for computing the model accruacy""" - train_dir = DATASET_DIR / "train" - val_dir = DATASET_DIR / "val" / "images" - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - train_dataset = ImageFolder( - train_dir, - transforms.Compose( - [ - transforms.Resize(IMAGE_SIZE), - transforms.ToTensor(), - normalize, - ] - ), - ) - val_dataset = ImageFolder( - val_dir, - transforms.Compose([transforms.Resize(IMAGE_SIZE), transforms.ToTensor(), normalize]), - ) - - train_loader = torch.utils.data.DataLoader( - train_dataset, - batch_size=batch_size, - shuffle=True, - num_workers=0, - pin_memory=True, - sampler=None, - ) - - val_loader = torch.utils.data.DataLoader( - val_dataset, - batch_size=batch_size, - shuffle=False, - num_workers=0, - pin_memory=True, - ) - return train_loader, val_loader - - - train_loader, val_loader = create_dataloaders() - -Model quantization and benchmarking ------------------------------------ - - - -With the validation pipeline, model files, and data-loading procedures -for model calibration now prepared, it’s time to proceed with the actual -post-training quantization using NNCF. - -I. Evaluate the loaded model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - acc1 = validate(val_loader, model) - print(f"Test accuracy of FP32 model: {acc1:.3f}") - - -.. 
parsed-literal:: - - Test: [ 0/79] Time 0.317 (0.317) Acc@1 81.25 (81.25) Acc@5 92.19 (92.19) - Test: [10/79] Time 0.391 (0.324) Acc@1 56.25 (66.97) Acc@5 86.72 (87.50) - Test: [20/79] Time 0.284 (0.315) Acc@1 67.97 (64.29) Acc@5 85.16 (87.35) - Test: [30/79] Time 0.291 (0.310) Acc@1 53.12 (62.37) Acc@5 77.34 (85.33) - Test: [40/79] Time 0.278 (0.306) Acc@1 67.19 (60.86) Acc@5 90.62 (84.51) - Test: [50/79] Time 0.269 (0.298) Acc@1 60.16 (60.80) Acc@5 88.28 (84.42) - Test: [60/79] Time 0.242 (0.290) Acc@1 66.41 (60.46) Acc@5 86.72 (83.79) - Test: [70/79] Time 0.264 (0.285) Acc@1 52.34 (60.21) Acc@5 80.47 (83.33) - * Acc@1 60.740 Acc@5 83.960 Total time: 22.199 - Test accuracy of FP32 model: 60.740 - - -II. Create and initialize quantization -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -NNCF enables post-training quantization by adding the quantization -layers into the model graph and then using a subset of the training -dataset to initialize the parameters of these additional quantization -layers. The framework is designed so that modifications to your original -training code are minor. Quantization is the simplest scenario and -requires a few modifications. For more information about NNCF Post -Training Quantization (PTQ) API, refer to the `Basic Quantization Flow -Guide `__. - -1. Create a transformation function that accepts a sample from the - dataset and returns data suitable for model inference. This enables - the creation of an instance of the nncf.Dataset class, which - represents the calibration dataset (based on the training dataset) - necessary for post-training quantization. - -.. code:: ipython3 - - def transform_fn(data_item): - images, _ = data_item - return images - - - calibration_dataset = nncf.Dataset(train_loader, transform_fn) - -2. Create a quantized model from the pre-trained ``FP32`` model and the - calibration dataset. - -.. code:: ipython3 - - quantized_model = nncf.quantize(model, calibration_dataset) - - -.. parsed-literal:: - - 2023-09-12 22:52:13.498264: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2023-09-12 22:52:13.533056: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2023-09-12 22:52:14.234552: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda' - - -.. parsed-literal:: - - INFO:nncf:Collecting tensor statistics |█████ | 1 / 3 - INFO:nncf:Collecting tensor statistics |██████████ | 2 / 3 - INFO:nncf:Collecting tensor statistics |████████████████| 3 / 3 - INFO:nncf:Compiling and loading torch extension: quantized_functions_cpu... - INFO:nncf:Finished loading torch extension: quantized_functions_cpu - INFO:nncf:BatchNorm statistics adaptation |█████ | 1 / 3 - INFO:nncf:BatchNorm statistics adaptation |██████████ | 2 / 3 - INFO:nncf:BatchNorm statistics adaptation |████████████████| 3 / 3 - - -3. Evaluate the new model on the validation set after initialization of - quantization. 
The accuracy should be close to the accuracy of the - floating-point ``FP32`` model for a simple case like the one being - demonstrated now. - -.. code:: ipython3 - - acc1 = validate(val_loader, quantized_model) - print(f"Accuracy of initialized INT8 model: {acc1:.3f}") - - -.. parsed-literal:: - - Test: [ 0/79] Time 0.469 (0.469) Acc@1 82.03 (82.03) Acc@5 92.97 (92.97) - Test: [10/79] Time 0.463 (0.489) Acc@1 60.16 (67.26) Acc@5 86.72 (87.78) - Test: [20/79] Time 0.487 (0.488) Acc@1 67.19 (64.47) Acc@5 85.94 (87.69) - Test: [30/79] Time 0.456 (0.497) Acc@1 53.12 (62.63) Acc@5 76.56 (85.48) - Test: [40/79] Time 0.416 (0.495) Acc@1 67.97 (61.11) Acc@5 89.84 (84.58) - Test: [50/79] Time 0.515 (0.491) Acc@1 62.50 (60.94) Acc@5 85.94 (84.50) - Test: [60/79] Time 0.534 (0.493) Acc@1 65.62 (60.50) Acc@5 86.72 (83.86) - Test: [70/79] Time 0.517 (0.495) Acc@1 53.91 (60.28) Acc@5 79.69 (83.41) - * Acc@1 60.880 Acc@5 84.050 Total time: 38.752 - Accuracy of initialized INT8 model: 60.880 - - -It should be noted that the inference time for the quantized PyTorch -model is longer than that of the original model, as fake quantizers are -added to the model by NNCF. However, the model’s performance will -significantly improve when it is in the OpenVINO Intermediate -Representation (IR) format. - -III. Convert the models to OpenVINO Intermediate Representation (OpenVINO IR) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To convert the Pytorch models to OpenVINO IR, use Model Conversion -Python API. The models will be saved to the ‘OUTPUT’ directory for later -benchmarking. - -For more information about model conversion, refer to this -`page `__. - -.. code:: ipython3 - - dummy_input = torch.randn(128, 3, *IMAGE_SIZE) - - model_ir = ov.convert_model(model, example_input=dummy_input, input=[-1, 3, *IMAGE_SIZE]) - - ov.save_model(model_ir, fp32_ir_path) - - -.. parsed-literal:: - - WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. - - -.. parsed-literal:: - - [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - - -.. code:: ipython3 - - quantized_model_ir = ov.convert_model(quantized_model, example_input=dummy_input, input=[-1, 3, *IMAGE_SIZE]) - - ov.save_model(quantized_model_ir, int8_ir_path) - - -.. parsed-literal:: - - /home/ea/work/ov_venv/lib/python3.8/site-packages/nncf/torch/quantization/layers.py:336: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - return self._level_low.item() - /home/ea/work/ov_venv/lib/python3.8/site-packages/nncf/torch/quantization/layers.py:344: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - return self._level_high.item() - /home/ea/work/ov_venv/lib/python3.8/site-packages/torch/jit/_trace.py:1084: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: - Tensor-likes are not close! 
- - Mismatched elements: 23985 / 25600 (93.7%) - Greatest absolute difference: 0.36202991008758545 at index (90, 14) (up to 1e-05 allowed) - Greatest relative difference: 1153.8073089700997 at index (116, 158) (up to 1e-05 allowed) - _check_trace( - - -Select inference device for OpenVINO - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -Evaluate the FP32 and INT8 models. - -.. code:: ipython3 - - core = ov.Core() - fp32_compiled_model = core.compile_model(model_ir, device.value) - acc1 = validate(val_loader, fp32_compiled_model) - print(f"Accuracy of FP32 IR model: {acc1:.3f}") - - -.. parsed-literal:: - - Test: [ 0/79] Time 0.166 (0.166) Acc@1 81.25 (81.25) Acc@5 92.19 (92.19) - Test: [10/79] Time 0.123 (0.128) Acc@1 56.25 (66.97) Acc@5 86.72 (87.50) - Test: [20/79] Time 0.121 (0.126) Acc@1 67.97 (64.29) Acc@5 85.16 (87.35) - Test: [30/79] Time 0.121 (0.125) Acc@1 53.12 (62.37) Acc@5 77.34 (85.33) - Test: [40/79] Time 0.123 (0.125) Acc@1 67.19 (60.86) Acc@5 90.62 (84.51) - Test: [50/79] Time 0.122 (0.125) Acc@1 60.16 (60.80) Acc@5 88.28 (84.42) - Test: [60/79] Time 0.124 (0.125) Acc@1 66.41 (60.46) Acc@5 86.72 (83.79) - Test: [70/79] Time 0.129 (0.125) Acc@1 52.34 (60.21) Acc@5 80.47 (83.33) - * Acc@1 60.740 Acc@5 83.960 Total time: 9.788 - Accuracy of FP32 IR model: 60.740 - - -.. code:: ipython3 - - int8_compiled_model = core.compile_model(quantized_model_ir, device.value) - acc1 = validate(val_loader, int8_compiled_model) - print(f"Accuracy of INT8 IR model: {acc1:.3f}") - - -.. parsed-literal:: - - Test: [ 0/79] Time 0.115 (0.115) Acc@1 82.81 (82.81) Acc@5 92.97 (92.97) - Test: [10/79] Time 0.072 (0.077) Acc@1 60.16 (67.83) Acc@5 86.72 (87.93) - Test: [20/79] Time 0.073 (0.075) Acc@1 69.53 (64.43) Acc@5 83.59 (87.43) - Test: [30/79] Time 0.067 (0.074) Acc@1 53.12 (62.63) Acc@5 75.00 (85.26) - Test: [40/79] Time 0.072 (0.074) Acc@1 66.41 (61.13) Acc@5 89.06 (84.41) - Test: [50/79] Time 0.075 (0.074) Acc@1 60.16 (60.92) Acc@5 88.28 (84.33) - Test: [60/79] Time 0.073 (0.074) Acc@1 66.41 (60.57) Acc@5 87.50 (83.67) - Test: [70/79] Time 0.072 (0.074) Acc@1 53.91 (60.29) Acc@5 80.47 (83.30) - * Acc@1 60.920 Acc@5 83.970 Total time: 5.761 - Accuracy of INT8 IR model: 60.920 - - -IV. Compare performance of INT8 model and FP32 model in OpenVINO -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Finally, measure the inference performance of the ``FP32`` and ``INT8`` -models, using `Benchmark -Tool `__ -- an inference performance measurement tool in OpenVINO. By default, -Benchmark Tool runs inference for 60 seconds in asynchronous mode on -CPU. It returns inference speed as latency (milliseconds per image) and -throughput (frames per second) values. - - **NOTE**: This notebook runs benchmark_app for 15 seconds to give a - quick indication of performance. For more accurate performance, it is - recommended to run benchmark_app in a terminal/command prompt after - closing other applications. Run ``benchmark_app -m model.xml -d CPU`` - to benchmark async inference on CPU for one minute. Change CPU to GPU - to benchmark on GPU. Run ``benchmark_app --help`` to see an overview - of all command-line options. - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. 
code:: ipython3 - - def parse_benchmark_output(benchmark_output: str): - """Prints the output from benchmark_app in human-readable format""" - parsed_output = [line for line in benchmark_output if "FPS" in line] - print(*parsed_output, sep="\n") - - - print("Benchmark FP32 model (OpenVINO IR)") - benchmark_output = ! benchmark_app -m "$fp32_ir_path" -d $device.value -api async -t 15 -shape "[1, 3, 512, 512]" - parse_benchmark_output(benchmark_output) - - print("Benchmark INT8 model (OpenVINO IR)") - benchmark_output = ! benchmark_app -m "$int8_ir_path" -d $device.value -api async -t 15 -shape "[1, 3, 512, 512]" - parse_benchmark_output(benchmark_output) - - print("Benchmark FP32 model (OpenVINO IR) synchronously") - benchmark_output = ! benchmark_app -m "$fp32_ir_path" -d $device.value -api sync -t 15 -shape "[1, 3, 512, 512]" - parse_benchmark_output(benchmark_output) - - print("Benchmark INT8 model (OpenVINO IR) synchronously") - benchmark_output = ! benchmark_app -m "$int8_ir_path" -d $device.value -api sync -t 15 -shape "[1, 3, 512, 512]" - parse_benchmark_output(benchmark_output) - - -.. parsed-literal:: - - Benchmark FP32 model (OpenVINO IR) - [ INFO ] Throughput: 51.11 FPS - Benchmark INT8 model (OpenVINO IR) - [ INFO ] Throughput: 196.85 FPS - Benchmark FP32 model (OpenVINO IR) synchronously - [ INFO ] Throughput: 36.68 FPS - Benchmark INT8 model (OpenVINO IR) synchronously - [ INFO ] Throughput: 128.84 FPS - - -Show device Information for reference: - -.. code:: ipython3 - - import openvino.properties as props - - - core = ov.Core() - devices = core.available_devices - - for device_name in devices: - device_full_name = core.get_property(device_name, props.device.full_name) - print(f"{device_name}: {device_full_name}") - - -.. parsed-literal:: - - CPU: Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz - GPU: NVIDIA GeForce GTX 1080 Ti (dGPU) - diff --git a/docs/notebooks/pytorch-quantization-aware-training-with-output.rst b/docs/notebooks/pytorch-quantization-aware-training-with-output.rst deleted file mode 100644 index 8aafb853a89ae7..00000000000000 --- a/docs/notebooks/pytorch-quantization-aware-training-with-output.rst +++ /dev/null @@ -1,823 +0,0 @@ -Quantization Aware Training with NNCF, using PyTorch framework -============================================================== - -This notebook is based on `ImageNet training in -PyTorch `__. - -The goal of this notebook is to demonstrate how to use the Neural -Network Compression Framework -`NNCF `__ 8-bit quantization to -optimize a PyTorch model for inference with OpenVINO Toolkit. The -optimization process contains the following steps: - -- Transforming the original ``FP32`` model to ``INT8`` -- Using fine-tuning to improve the accuracy. -- Exporting optimized and original models to OpenVINO IR -- Measuring and comparing the performance of models. - -For more advanced usage, refer to these -`examples `__. - -This tutorial uses the ResNet-18 model with the Tiny ImageNet-200 -dataset. ResNet-18 is the version of ResNet models that contains the -fewest layers (18). Tiny ImageNet-200 is a subset of the larger ImageNet -dataset with smaller images. The dataset will be downloaded in the -notebook. Using the smaller model and dataset will speed up training and -download time. To see other ResNet models, visit `PyTorch -hub `__. - - **NOTE**: This notebook requires a C++ compiler for compiling PyTorch - custom operations for quantization. 
For Windows we recommend to - install Visual Studio with C++ support, you can find instruction - `here `__. - For MacOS ``xcode-select --install`` command installs many developer - tools, including C++. For Linux you can install gcc with your - distribution’s package manager. - - -**Table of contents:** - - -- `Imports and Settings <#imports-and-settings>`__ -- `Pre-train Floating-Point Model <#pre-train-floating-point-model>`__ - - - `Train Function <#train-function>`__ - - `Validate Function <#validate-function>`__ - - `Helpers <#helpers>`__ - - `Get a Pre-trained FP32 Model <#get-a-pre-trained-fp32-model>`__ - -- `Create and Initialize - Quantization <#create-and-initialize-quantization>`__ -- `Fine-tune the Compressed Model <#fine-tune-the-compressed-model>`__ -- `Export INT8 Model to OpenVINO - IR <#export-int8-model-to-openvino-ir>`__ -- `Benchmark Model Performance by Computing Inference - Time <#benchmark-model-performance-by-computing-inference-time>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "openvino>=2024.0.0" "torch" "torchvision" "tqdm" - %pip install -q "nncf>=2.9.0" - - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - -Imports and Settings --------------------- - - - -On Windows, add the required C++ directories to the system PATH. - -Import NNCF and all auxiliary packages from your Python code. Set a name -for the model, and the image width and height that will be used for the -network. Also define paths where PyTorch and OpenVINO IR versions of the -models will be stored. - - **NOTE**: All NNCF logging messages below ERROR level (INFO and - WARNING) are disabled to simplify the tutorial. For production use, - it is recommended to enable logging by removing - ``set_log_level(logging.ERROR)``. - -.. code:: ipython3 - - import time - import warnings # To disable warnings on export model - import zipfile - from pathlib import Path - - import torch - - import torch.nn as nn - import torch.nn.parallel - import torch.optim - import torch.utils.data - import torch.utils.data.distributed - import torchvision.datasets as datasets - import torchvision.models as models - import torchvision.transforms as transforms - - import openvino as ov - from torch.jit import TracerWarning - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, device_widget - - torch.manual_seed(0) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print(f"Using {device} device") - - MODEL_DIR = Path("model") - OUTPUT_DIR = Path("output") - DATA_DIR = Path("data") - BASE_MODEL_NAME = "resnet18" - image_size = 64 - - OUTPUT_DIR.mkdir(exist_ok=True) - MODEL_DIR.mkdir(exist_ok=True) - DATA_DIR.mkdir(exist_ok=True) - - # Paths where PyTorch and OpenVINO IR models will be stored. 
- fp32_pth_path = Path(MODEL_DIR / (BASE_MODEL_NAME + "_fp32")).with_suffix(".pth") - fp32_ir_path = fp32_pth_path.with_suffix(".xml") - int8_ir_path = Path(MODEL_DIR / (BASE_MODEL_NAME + "_int8")).with_suffix(".xml") - - # It is possible to train FP32 model from scratch, but it might be slow. Therefore, the pre-trained weights are downloaded by default. - pretrained_on_tiny_imagenet = True - fp32_pth_url = "https://storage.openvinotoolkit.org/repositories/nncf/openvino_notebook_ckpts/302_resnet18_fp32_v1.pth" - if not (MODEL_DIR / fp32_pth_path).exists(): - download_file(fp32_pth_url, directory=MODEL_DIR, filename=fp32_pth_path.name) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("pytorch-quantization-aware-training.ipynb") - - -.. parsed-literal:: - - Using cuda device - - - -.. parsed-literal:: - - model/resnet18_fp32.pth: 0%| | 0.00/43.1M [00:00 best_acc1 - best_acc1 = max(acc1, best_acc1) - - if is_best: - checkpoint = {"state_dict": model.state_dict(), "acc1": acc1} - torch.save(checkpoint, fp32_pth_path) - acc1_fp32 = best_acc1 - - print(f"Accuracy of FP32 model: {acc1_fp32:.3f}") - - -.. parsed-literal:: - - Accuracy of FP32 model: 55.520 - - -Export the ``FP32`` model to OpenVINO™ Intermediate Representation, to -benchmark it in comparison with the ``INT8`` model. - -.. code:: ipython3 - - dummy_input = torch.randn(1, 3, image_size, image_size).to(device) - - ov_model = ov.convert_model(model, example_input=dummy_input, input=[1, 3, image_size, image_size]) - ov.save_model(ov_model, fp32_ir_path, compress_to_fp16=False) - print(f"FP32 model was exported to {fp32_ir_path}.") - - -.. parsed-literal:: - - FP32 model was exported to model/resnet18_fp32.xml. - - -Create and Initialize Quantization ----------------------------------- - - - -NNCF enables compression-aware training by integrating into regular -training pipelines. The framework is designed so that modifications to -your original training code are minor. Quantization requires only 2 -modifications. - -1. Create a quantization data loader with batch size equal to one and - wrap it by the ``nncf.Dataset``, specifying a transformation function - which prepares input data to fit into model during quantization. In - our case, to pick input tensor from pair (input tensor and label). - -.. code:: ipython3 - - import nncf - - - def transform_fn(data_item): - return data_item[0] - - - # Creating separate dataloader with batch size = 1 - # as dataloaders with batches > 1 is not supported yet. - quantization_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=0, pin_memory=True) - - quantization_dataset = nncf.Dataset(quantization_loader, transform_fn) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -2. Run ``nncf.quantize`` for Getting an Optimized Model. - -``nncf.quantize`` function accepts model and prepared quantization -dataset for performing basic quantization. Optionally, additional -parameters like ``subset_size``, ``preset``, ``ignored_scope`` can be -provided to improve quantization result if applicable. More details -about supported parameters can be found on this -`page `__ - -.. code:: ipython3 - - quantized_model = nncf.quantize(model, quantization_dataset) - - -.. 
parsed-literal:: - - 2024-01-17 15:43:43.543878: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-01-17 15:43:43.579576: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-01-17 15:43:44.170538: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - -.. parsed-literal:: - - WARNING:nncf:NNCF provides best results with torch==2.1.0, while current torch version is 1.13.0+cu117. If you encounter issues, consider switching to torch==2.1.0 - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - INFO:nncf:Compiling and loading torch extension: quantized_functions_cuda... - INFO:nncf:Finished loading torch extension: quantized_functions_cuda - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -Evaluate the new model on the validation set after initialization of -quantization. The accuracy should be close to the accuracy of the -floating-point ``FP32`` model for a simple case like the one being -demonstrated here. - -.. code:: ipython3 - - acc1 = validate(val_loader, quantized_model, criterion) - print(f"Accuracy of initialized INT8 model: {acc1:.3f}") - - -.. parsed-literal:: - - Test: [ 0/79] Time 0.110 (0.110) Loss 0.992 (0.992) Acc@1 78.12 (78.12) Acc@5 89.06 (89.06) - Test: [10/79] Time 0.069 (0.074) Loss 1.990 (1.623) Acc@1 44.53 (60.37) Acc@5 79.69 (83.95) - Test: [20/79] Time 0.068 (0.072) Loss 1.814 (1.704) Acc@1 60.16 (58.26) Acc@5 80.47 (82.63) - Test: [30/79] Time 0.068 (0.071) Loss 2.284 (1.794) Acc@1 52.34 (56.75) Acc@5 67.97 (80.90) - Test: [40/79] Time 0.070 (0.072) Loss 1.618 (1.831) Acc@1 61.72 (55.64) Acc@5 82.03 (80.37) - Test: [50/79] Time 0.069 (0.071) Loss 1.951 (1.832) Acc@1 57.81 (55.70) Acc@5 75.00 (80.06) - Test: [60/79] Time 0.070 (0.071) Loss 1.795 (1.855) Acc@1 56.25 (55.28) Acc@5 84.38 (79.75) - Test: [70/79] Time 0.069 (0.071) Loss 2.359 (1.888) Acc@1 47.66 (54.79) Acc@5 74.22 (79.08) - * Acc@1 55.130 Acc@5 79.680 - Accuracy of initialized INT8 model: 55.130 - - -Fine-tune the Compressed Model ------------------------------- - - - -At this step, a regular fine-tuning process is applied to further -improve quantized model accuracy. Normally, several epochs of tuning are -required with a small learning rate, the same that is usually used at -the end of the training of the original model. No other changes in the -training pipeline are required. Here is a simple example. - -.. code:: ipython3 - - compression_lr = init_lr / 10 - optimizer = torch.optim.Adam(quantized_model.parameters(), lr=compression_lr) - - # Train for one epoch with NNCF. - train(train_loader, quantized_model, criterion, optimizer, epoch=0) - - # Evaluate on validation set after Quantization-Aware Training (QAT case). - acc1_int8 = validate(val_loader, quantized_model, criterion) - - print(f"Accuracy of tuned INT8 model: {acc1_int8:.3f}") - print(f"Accuracy drop of tuned INT8 model over pre-trained FP32 model: {acc1_fp32 - acc1_int8:.3f}") - - -.. 
parsed-literal:: - - Epoch:[0][ 0/782] Time 0.284 (0.284) Loss 0.876 (0.876) Acc@1 78.12 (78.12) Acc@5 92.97 (92.97) - Epoch:[0][ 50/782] Time 0.112 (0.116) Loss 0.796 (0.808) Acc@1 80.47 (79.96) Acc@5 94.53 (94.27) - Epoch:[0][100/782] Time 0.111 (0.114) Loss 0.785 (0.788) Acc@1 82.81 (80.52) Acc@5 92.19 (94.56) - Epoch:[0][150/782] Time 0.114 (0.113) Loss 0.653 (0.785) Acc@1 84.38 (80.69) Acc@5 95.31 (94.45) - Epoch:[0][200/782] Time 0.109 (0.113) Loss 0.804 (0.780) Acc@1 80.47 (80.92) Acc@5 94.53 (94.45) - Epoch:[0][250/782] Time 0.111 (0.113) Loss 0.756 (0.777) Acc@1 83.59 (80.98) Acc@5 94.53 (94.47) - Epoch:[0][300/782] Time 0.112 (0.112) Loss 0.665 (0.772) Acc@1 82.03 (81.07) Acc@5 96.88 (94.53) - Epoch:[0][350/782] Time 0.115 (0.112) Loss 0.661 (0.767) Acc@1 82.81 (81.14) Acc@5 97.66 (94.57) - Epoch:[0][400/782] Time 0.111 (0.113) Loss 0.661 (0.764) Acc@1 78.91 (81.24) Acc@5 96.09 (94.60) - Epoch:[0][450/782] Time 0.119 (0.113) Loss 0.904 (0.762) Acc@1 79.69 (81.27) Acc@5 89.06 (94.60) - Epoch:[0][500/782] Time 0.113 (0.113) Loss 0.609 (0.757) Acc@1 84.38 (81.46) Acc@5 96.88 (94.62) - Epoch:[0][550/782] Time 0.112 (0.113) Loss 0.833 (0.753) Acc@1 76.56 (81.59) Acc@5 95.31 (94.69) - Epoch:[0][600/782] Time 0.112 (0.113) Loss 0.768 (0.751) Acc@1 82.81 (81.63) Acc@5 95.31 (94.69) - Epoch:[0][650/782] Time 0.112 (0.113) Loss 0.750 (0.751) Acc@1 82.81 (81.61) Acc@5 93.75 (94.71) - Epoch:[0][700/782] Time 0.110 (0.113) Loss 0.654 (0.749) Acc@1 84.38 (81.62) Acc@5 96.09 (94.71) - Epoch:[0][750/782] Time 0.110 (0.113) Loss 0.575 (0.748) Acc@1 86.72 (81.67) Acc@5 97.66 (94.73) - Test: [ 0/79] Time 0.070 (0.070) Loss 1.028 (1.028) Acc@1 78.91 (78.91) Acc@5 86.72 (86.72) - Test: [10/79] Time 0.070 (0.070) Loss 1.827 (1.514) Acc@1 46.88 (63.35) Acc@5 79.69 (84.02) - Test: [20/79] Time 0.073 (0.070) Loss 1.628 (1.594) Acc@1 64.06 (60.97) Acc@5 82.03 (83.78) - Test: [30/79] Time 0.069 (0.070) Loss 2.061 (1.688) Acc@1 57.03 (59.25) Acc@5 71.88 (82.26) - Test: [40/79] Time 0.070 (0.070) Loss 1.495 (1.738) Acc@1 66.41 (57.93) Acc@5 85.16 (81.59) - Test: [50/79] Time 0.069 (0.070) Loss 1.863 (1.741) Acc@1 58.59 (57.83) Acc@5 76.56 (81.31) - Test: [60/79] Time 0.069 (0.070) Loss 1.571 (1.779) Acc@1 65.62 (57.21) Acc@5 84.38 (80.74) - Test: [70/79] Time 0.069 (0.070) Loss 2.505 (1.809) Acc@1 46.09 (56.78) Acc@5 75.00 (80.22) - * Acc@1 57.200 Acc@5 80.880 - Accuracy of tuned INT8 model: 57.200 - Accuracy drop of tuned INT8 model over pre-trained FP32 model: -1.680 - - -Export INT8 Model to OpenVINO IR --------------------------------- - - - -.. code:: ipython3 - - if not int8_ir_path.exists(): - warnings.filterwarnings("ignore", category=TracerWarning) - warnings.filterwarnings("ignore", category=UserWarning) - # Export INT8 model to OpenVINO™ IR - ov_model = ov.convert_model(quantized_model, example_input=dummy_input, input=[1, 3, image_size, image_size]) - ov.save_model(ov_model, int8_ir_path) - print(f"INT8 model exported to {int8_ir_path}.") - - -.. parsed-literal:: - - WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. - INT8 Omodel exported to model/resnet18_int8.xml. - - -Benchmark Model Performance by Computing Inference Time -------------------------------------------------------- - - - -Finally, measure the inference performance of the ``FP32`` and ``INT8`` -models, using `Benchmark -Tool `__ -- inference performance measurement tool in OpenVINO. 
By default, -Benchmark Tool runs inference for 60 seconds in asynchronous mode on -CPU. It returns inference speed as latency (milliseconds per image) and -throughput (frames per second) values. - - **NOTE**: This notebook runs ``benchmark_app`` for 15 seconds to give - a quick indication of performance. For more accurate performance, it - is recommended to run ``benchmark_app`` in a terminal/command prompt - after closing other applications. Run - ``benchmark_app -m model.xml -d CPU`` to benchmark async inference on - CPU for one minute. Change CPU to GPU to benchmark on GPU. Run - ``benchmark_app --help`` to see an overview of all command-line - options. - -.. code:: ipython3 - - device = device_widget() - - device - -.. code:: ipython3 - - def parse_benchmark_output(benchmark_output): - parsed_output = [line for line in benchmark_output if "FPS" in line] - print(*parsed_output, sep="\n") - - - print("Benchmark FP32 model (IR)") - benchmark_output = ! benchmark_app -m $fp32_ir_path -d $device.value -api async -t 15 - parse_benchmark_output(benchmark_output) - - print("Benchmark INT8 model (IR)") - benchmark_output = ! benchmark_app -m $int8_ir_path -d $device.value -api async -t 15 - parse_benchmark_output(benchmark_output) - - -.. parsed-literal:: - - Benchmark FP32 model (IR) - [ INFO ] Throughput: 3755.92 FPS - Benchmark INT8 model (IR) - [ INFO ] Throughput: 15141.53 FPS - - -Show Device Information for reference. - -.. code:: ipython3 - - import openvino.properties as props - - - core = ov.Core() - core.get_property(device.value, props.device.full_name) - - - - -.. parsed-literal:: - - 'Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz' - - diff --git a/docs/notebooks/pytorch-quantization-sparsity-aware-training-with-output.rst b/docs/notebooks/pytorch-quantization-sparsity-aware-training-with-output.rst deleted file mode 100644 index eedb2fc21e7edb..00000000000000 --- a/docs/notebooks/pytorch-quantization-sparsity-aware-training-with-output.rst +++ /dev/null @@ -1,536 +0,0 @@ -Quantization-Sparsity Aware Training with NNCF, using PyTorch framework -======================================================================= - -This notebook is based on `ImageNet training in -PyTorch `__. - -The goal of this notebook is to demonstrate how to use the Neural -Network Compression Framework -`NNCF `__ 8-bit quantization to -optimize a PyTorch model for inference with OpenVINO Toolkit. The -optimization process contains the following steps: - -- Transforming the original dense ``FP32`` model to sparse ``INT8`` -- Using fine-tuning to improve the accuracy. -- Exporting optimized and original models to OpenVINO IR -- Measuring and comparing the performance of models. - -For more advanced usage, refer to these -`examples `__. - -This tutorial uses the ResNet-50 model with the ImageNet dataset. The -dataset must be downloaded separately. To see ResNet models, visit -`PyTorch hub `__. 
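Later in this notebook, the sparsity and quantization setup is read from a
``config.json`` file that is not reproduced in this document. As a rough,
hedged illustration of what such an NNCF configuration might contain (the
multistep schedule and target sparsity levels below are assumptions, not the
notebook's actual values), an equivalent in-code definition could look like
this:

.. code:: ipython3

    from nncf import NNCFConfig

    # Illustrative only: combines magnitude sparsity with 8-bit quantization.
    # The schedule steps and sparsity levels are example values, not the ones
    # used by the original notebook's config.json.
    nncf_config_dict = {
        "input_info": {"sample_size": [1, 3, 64, 64]},
        "compression": [
            {
                "algorithm": "magnitude_sparsity",
                "sparsity_init": 0.0,
                "params": {
                    "schedule": "multistep",
                    "multistep_steps": [2, 5],
                    "multistep_sparsity_levels": [0.3, 0.5],
                },
            },
            {"algorithm": "quantization"},
        ],
    }
    nncf_config = NNCFConfig.from_dict(nncf_config_dict)
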
- - -**Table of contents:** - - -- `Imports and Settings <#imports-and-settings>`__ -- `Pre-train Floating-Point Model <#pre-train-floating-point-model>`__ - - - `Train Function <#train-function>`__ - - `Validate Function <#validate-function>`__ - - `Helpers <#helpers>`__ - - `Get a Pre-trained FP32 Model <#get-a-pre-trained-fp32-model>`__ - -- `Create and Initialize - Quantization <#create-and-initialize-quantization>`__ -- `Fine-tune the Compressed Model <#fine-tune-the-compressed-model>`__ -- `Export INT8 Sparse Model to OpenVINO - IR <#export-int8-model-to-openvino-ir>`__ -- `Benchmark Model Performance by Computing Inference - Time <#benchmark-model-performance-by-computing-inference-time>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "openvino>=2024.0.0" "torch" "torchvision" "tqdm" - %pip install -q "nncf>=2.9.0" - -Imports and Settings --------------------- - - - -On Windows, add the required C++ directories to the system PATH. - -Import NNCF and all auxiliary packages from your Python code. Set a name -for the model, and the image width and height that will be used for the -network. Also define paths where PyTorch and OpenVINO IR versions of the -models will be stored. - - **NOTE**: All NNCF logging messages below ERROR level (INFO and - WARNING) are disabled to simplify the tutorial. For production use, - it is recommended to enable logging by removing - ``set_log_level(logging.ERROR)``. - -.. code:: ipython3 - - import time - import warnings # To disable warnings on export model - from pathlib import Path - - import torch - - import torch.nn as nn - import torch.nn.parallel - import torch.optim - import torch.utils.data - import torch.utils.data.distributed - import torchvision.datasets as datasets - import torchvision.transforms as transforms - import torchvision.models as models - - import openvino as ov - from torch.jit import TracerWarning - - torch.manual_seed(0) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print(f"Using {device} device") - - MODEL_DIR = Path("model") - OUTPUT_DIR = Path("output") - # DATA_DIR = Path("...") # Insert path to folder containing imagenet folder - # DATASET_DIR = DATA_DIR / "imagenet" - -.. code:: ipython3 - - # Fetch `notebook_utils` module - import zipfile - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, device_widget - - DATA_DIR = Path("data") - - - def download_tiny_imagenet_200( - data_dir: Path, - url="http://cs231n.stanford.edu/tiny-imagenet-200.zip", - tarname="tiny-imagenet-200.zip", - ): - archive_path = data_dir / tarname - download_file(url, directory=data_dir, filename=tarname) - zip_ref = zipfile.ZipFile(archive_path, "r") - zip_ref.extractall(path=data_dir) - zip_ref.close() - - - def prepare_tiny_imagenet_200(dataset_dir: Path): - # Format validation set the same way as train set is formatted. 
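        # Tiny ImageNet ships all validation images in a single "images" folder,
        # together with a val_annotations.txt file whose first two tab-separated
        # fields map each file name to its class label. The steps below move every
        # image into a per-class subfolder so the validation split can be loaded
        # with torchvision.datasets.ImageFolder, mirroring the train split layout.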
- val_data_dir = dataset_dir / "val" - val_annotations_file = val_data_dir / "val_annotations.txt" - with open(val_annotations_file, "r") as f: - val_annotation_data = map(lambda line: line.split("\t")[:2], f.readlines()) - val_images_dir = val_data_dir / "images" - for image_filename, image_label in val_annotation_data: - from_image_filepath = val_images_dir / image_filename - to_image_dir = val_data_dir / image_label - if not to_image_dir.exists(): - to_image_dir.mkdir() - to_image_filepath = to_image_dir / image_filename - from_image_filepath.rename(to_image_filepath) - val_annotations_file.unlink() - val_images_dir.rmdir() - - - DATASET_DIR = DATA_DIR / "tiny-imagenet-200" - if not DATASET_DIR.exists(): - download_tiny_imagenet_200(DATA_DIR) - prepare_tiny_imagenet_200(DATASET_DIR) - print(f"Successfully downloaded and prepared dataset at: {DATASET_DIR}") - - BASE_MODEL_NAME = "resnet18" - image_size = 64 - - OUTPUT_DIR.mkdir(exist_ok=True) - MODEL_DIR.mkdir(exist_ok=True) - DATA_DIR.mkdir(exist_ok=True) - - # Paths where PyTorch and OpenVINO IR models will be stored. - fp32_pth_path = Path(MODEL_DIR / (BASE_MODEL_NAME + "_fp32")).with_suffix(".pth") - fp32_ir_path = fp32_pth_path.with_suffix(".xml") - int8_sparse_ir_path = Path(MODEL_DIR / (BASE_MODEL_NAME + "_int8_sparse")).with_suffix(".xml") - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("pytorch-quantization-sparsity-aware-training.ipynb") - -Train Function -~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - def train(train_loader, model, compression_ctrl, criterion, optimizer, epoch): - batch_time = AverageMeter("Time", ":3.3f") - losses = AverageMeter("Loss", ":2.3f") - top1 = AverageMeter("Acc@1", ":2.2f") - top5 = AverageMeter("Acc@5", ":2.2f") - progress = ProgressMeter( - len(train_loader), - [batch_time, losses, top1, top5], - prefix="Epoch:[{}]".format(epoch), - ) - - # Switch to train mode. - model.train() - - end = time.time() - for i, (images, target) in enumerate(train_loader): - images = images.to(device) - target = target.to(device) - - # Compute output. - output = model(images) - loss = criterion(output, target) - - # Measure accuracy and record loss. - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - losses.update(loss.item(), images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - - # Compute gradient and do opt step. - optimizer.zero_grad() - loss.backward() - optimizer.step() - - # Measure elapsed time. - batch_time.update(time.time() - end) - end = time.time() - - print_frequency = 50 - if i % print_frequency == 0: - progress.display(i) - compression_ctrl.scheduler.step() - -Validate Function -~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - def validate(val_loader, model, criterion): - batch_time = AverageMeter("Time", ":3.3f") - losses = AverageMeter("Loss", ":2.3f") - top1 = AverageMeter("Acc@1", ":2.2f") - top5 = AverageMeter("Acc@5", ":2.2f") - progress = ProgressMeter(len(val_loader), [batch_time, losses, top1, top5], prefix="Test: ") - - # Switch to evaluate mode. - model.eval() - - with torch.no_grad(): - end = time.time() - for i, (images, target) in enumerate(val_loader): - images = images.to(device) - target = target.to(device) - - # Compute output. - output = model(images) - loss = criterion(output, target) - - # Measure accuracy and record loss. 
- acc1, acc5 = accuracy(output, target, topk=(1, 5)) - losses.update(loss.item(), images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - - # Measure elapsed time. - batch_time.update(time.time() - end) - end = time.time() - - print_frequency = 10 - if i % print_frequency == 0: - progress.display(i) - - print(" * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}".format(top1=top1, top5=top5)) - return top1.avg - -Helpers -~~~~~~~ - - - -.. code:: ipython3 - - class AverageMeter(object): - """Computes and stores the average and current value""" - - def __init__(self, name, fmt=":f"): - self.name = name - self.fmt = fmt - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" - return fmtstr.format(**self.__dict__) - - - class ProgressMeter(object): - def __init__(self, num_batches, meters, prefix=""): - self.batch_fmtstr = self._get_batch_fmtstr(num_batches) - self.meters = meters - self.prefix = prefix - - def display(self, batch): - entries = [self.prefix + self.batch_fmtstr.format(batch)] - entries += [str(meter) for meter in self.meters] - print("\t".join(entries)) - - def _get_batch_fmtstr(self, num_batches): - num_digits = len(str(num_batches // 1)) - fmt = "{:" + str(num_digits) + "d}" - return "[" + fmt + "/" + fmt.format(num_batches) + "]" - - - def accuracy(output, target, topk=(1,)): - """Computes the accuracy over the k top predictions for the specified values of k""" - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - -Get a Pre-trained FP32 Model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -А pre-trained floating-point model is a prerequisite for quantization. -It can be obtained by tuning from scratch with the code below. - -.. code:: ipython3 - - num_classes = 1000 - init_lr = 1e-4 - batch_size = 128 - epochs = 20 - - # model = models.resnet50(pretrained=True) - model = models.resnet18(pretrained=True) - model.fc = nn.Linear(in_features=512, out_features=200, bias=True) - model.to(device) - - - # Data loading code. - train_dir = DATASET_DIR / "train" - val_dir = DATASET_DIR / "val" - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - - train_dataset = datasets.ImageFolder( - train_dir, - transforms.Compose( - [ - transforms.Resize([image_size, image_size]), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ] - ), - ) - val_dataset = datasets.ImageFolder( - val_dir, - transforms.Compose( - [ - transforms.Resize([256, 256]), - transforms.CenterCrop([image_size, image_size]), - transforms.ToTensor(), - normalize, - ] - ), - ) - - train_loader = torch.utils.data.DataLoader( - train_dataset, - batch_size=batch_size, - shuffle=True, - num_workers=1, - pin_memory=True, - sampler=None, - ) - - val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=1, pin_memory=True) - - # Define loss function (criterion) and optimizer. 
- criterion = nn.CrossEntropyLoss().to(device) - optimizer = torch.optim.Adam(model.parameters(), lr=init_lr) - -Export the ``FP32`` model to OpenVINO™ Intermediate Representation, to -benchmark it in comparison with the ``INT8`` model. - -.. code:: ipython3 - - dummy_input = torch.randn(1, 3, image_size, image_size).to(device) - - ov_model = ov.convert_model(model, example_input=dummy_input, input=[1, 3, image_size, image_size]) - ov.save_model(ov_model, fp32_ir_path, compress_to_fp16=False) - print(f"FP32 model was exported to {fp32_ir_path}.") - -Create and Initialize Quantization and Sparsity Training --------------------------------------------------------- - - - -NNCF enables compression-aware training by integrating into regular -training pipelines. The framework is designed so that modifications to -your original training code are minor. - -.. code:: ipython3 - - from nncf import NNCFConfig - from nncf.torch import create_compressed_model, register_default_init_args - - # load - nncf_config = NNCFConfig.from_json("config.json") - nncf_config = register_default_init_args(nncf_config, train_loader) - - # Creating a compressed model - compression_ctrl, compressed_model = create_compressed_model(model, nncf_config) - compression_ctrl.scheduler.epoch_step() - -Validate Compressed Model - -Evaluate the new model on the validation set after initialization of -quantization and sparsity. - -.. code:: ipython3 - - acc1 = validate(val_loader, compressed_model, criterion) - print(f"Accuracy of initialized sparse INT8 model: {acc1:.3f}") - -Fine-tune the Compressed Model ------------------------------- - - - -At this step, a regular fine-tuning process is applied to further -improve quantized model accuracy. Normally, several epochs of tuning are -required with a small learning rate, the same that is usually used at -the end of the training of the original model. No other changes in the -training pipeline are required. Here is a simple example. - -.. code:: ipython3 - - compression_lr = init_lr / 10 - optimizer = torch.optim.Adam(compressed_model.parameters(), lr=compression_lr) - nr_epochs = 10 - # Train for one epoch with NNCF. - print("Training") - for epoch in range(nr_epochs): - compression_ctrl.scheduler.epoch_step() - train(train_loader, compressed_model, compression_ctrl, criterion, optimizer, epoch=epoch) - - # Evaluate on validation set after Quantization-Aware Training (QAT case). - print("Validating") - acc1_int8_sparse = validate(val_loader, compressed_model, criterion) - - print(f"Accuracy of tuned INT8 sparse model: {acc1_int8_sparse:.3f}") - print(f"Accuracy drop of tuned INT8 sparse model over pre-trained FP32 model: {acc1 - acc1_int8_sparse:.3f}") - -Export INT8 Sparse Model to OpenVINO IR ---------------------------------------- - - - -.. code:: ipython3 - - warnings.filterwarnings("ignore", category=TracerWarning) - warnings.filterwarnings("ignore", category=UserWarning) - # Export INT8 model to OpenVINO™ IR - ov_model = ov.convert_model(compressed_model, example_input=dummy_input, input=[1, 3, image_size, image_size]) - ov.save_model(ov_model, int8_sparse_ir_path) - print(f"INT8 sparse model exported to {int8_sparse_ir_path}.") - -Benchmark Model Performance by Computing Inference Time -------------------------------------------------------- - - - -Finally, measure the inference performance of the ``FP32`` and ``INT8`` -models, using `Benchmark -Tool `__ -- inference performance measurement tool in OpenVINO. 
By default, -Benchmark Tool runs inference for 60 seconds in asynchronous mode on -CPU. It returns inference speed as latency (milliseconds per image) and -throughput (frames per second) values. - - **NOTE**: This notebook runs ``benchmark_app`` for 15 seconds to give - a quick indication of performance. For more accurate performance, it - is recommended to run ``benchmark_app`` in a terminal/command prompt - after closing other applications. Run - ``benchmark_app -m model.xml -d CPU`` to benchmark async inference on - CPU for one minute. Change CPU to GPU to benchmark on GPU. Run - ``benchmark_app --help`` to see an overview of all command-line - options. - -.. code:: ipython3 - - # Initialize OpenVINO runtime - core = ov.Core() - device = device_widget() - - device - -.. code:: ipython3 - - def parse_benchmark_output(benchmark_output): - parsed_output = [line for line in benchmark_output if "FPS" in line] - print(*parsed_output, sep="\n") - - - print("Benchmark FP32 model (IR)") - benchmark_output = ! benchmark_app -m $fp32_ir_path -d $device.value -api async -t 15 - parse_benchmark_output(benchmark_output) - - print("Benchmark INT8 sparse model (IR)") - benchmark_output = ! benchmark_app -m $int8_ir_path -d $device.value -api async -t 15 - parse_benchmark_output(benchmark_output) - -Show Device Information for reference. - -.. code:: ipython3 - - import openvino.properties as props - - - core.get_property(device.value, props.device.full_name) diff --git a/docs/notebooks/pytorch-to-openvino-with-output.rst b/docs/notebooks/pytorch-to-openvino-with-output.rst deleted file mode 100644 index da531ef789f3be..00000000000000 --- a/docs/notebooks/pytorch-to-openvino-with-output.rst +++ /dev/null @@ -1,848 +0,0 @@ -Convert a PyTorch Model to OpenVINO™ IR -======================================= - -This tutorial demonstrates step-by-step instructions on how to do -inference on a PyTorch classification model using OpenVINO Runtime. -Starting from OpenVINO 2023.0 release, OpenVINO supports direct PyTorch -model conversion without an intermediate step to convert them into ONNX -format. In order, if you try to use the lower OpenVINO version or prefer -to use ONNX, please check this -`tutorial `__. - -In this tutorial, we will use the -`RegNetY_800MF `__ model from -`torchvision `__ to -demonstrate how to convert PyTorch models to OpenVINO Intermediate -Representation. - -The RegNet model was proposed in `Designing Network Design -Spaces `__ by Ilija Radosavovic, Raj -Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. The authors -design search spaces to perform Neural Architecture Search (NAS). They -first start from a high dimensional search space and iteratively reduce -the search space by empirically applying constraints based on the -best-performing models sampled by the current search space. Instead of -focusing on designing individual network instances, authors design -network design spaces that parametrize populations of networks. The -overall process is analogous to the classic manual design of networks -but elevated to the design space level. The RegNet design space provides -simple and fast networks that work well across a wide range of flop -regimes. 
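For readers on an OpenVINO release older than 2023.0, or those who prefer the
ONNX route mentioned above, a minimal sketch of the two-step conversion might
look like the following. The file names and input resolution are illustrative
assumptions, not values used later in this tutorial.

.. code:: ipython3

    import torch
    import torchvision

    import openvino as ov

    # Any torch.nn.Module in evaluation mode works the same way.
    model = torchvision.models.regnet_y_800mf(weights="DEFAULT").eval()
    dummy_input = torch.randn(1, 3, 224, 224)

    # Step 1: export the PyTorch model to ONNX (the intermediate step that
    # direct conversion makes unnecessary starting from OpenVINO 2023.0).
    torch.onnx.export(model, dummy_input, "regnet_y_800mf.onnx", opset_version=13)

    # Step 2: convert the ONNX file to OpenVINO IR and save it to disk.
    ov_model = ov.convert_model("regnet_y_800mf.onnx")
    ov.save_model(ov_model, "regnet_y_800mf_from_onnx.xml")
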
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Load PyTorch Model <#load-pytorch-model>`__ - - - `Prepare Input Data <#prepare-input-data>`__ - - `Run PyTorch Model Inference <#run-pytorch-model-inference>`__ - - `Benchmark PyTorch Model - Inference <#benchmark-pytorch-model-inference>`__ - -- `Convert PyTorch Model to OpenVINO Intermediate - Representation <#convert-pytorch-model-to-openvino-intermediate-representation>`__ - - - `Select inference device <#select-inference-device>`__ - - `Run OpenVINO Model Inference <#run-openvino-model-inference>`__ - - `Benchmark OpenVINO Model - Inference <#benchmark-openvino-model-inference>`__ - -- `Convert PyTorch Model with Static Input - Shape <#convert-pytorch-model-with-static-input-shape>`__ - - - `Select inference device <#select-inference-device>`__ - - `Run OpenVINO Model Inference with Static Input - Shape <#run-openvino-model-inference-with-static-input-shape>`__ - - `Benchmark OpenVINO Model Inference with Static Input - Shape <#benchmark-openvino-model-inference-with-static-input-shape>`__ - -- `Convert TorchScript Model to OpenVINO Intermediate - Representation <#convert-torchscript-model-to-openvino-intermediate-representation>`__ - - - `Scripted Model <#scripted-model>`__ - - `Benchmark Scripted Model - Inference <#benchmark-scripted-model-inference>`__ - - `Convert PyTorch Scripted Model to OpenVINO Intermediate - Representation <#convert-pytorch-scripted-model-to-openvino-intermediate-representation>`__ - - `Benchmark OpenVINO Model Inference Converted From Scripted - Model <#benchmark-openvino-model-inference-converted-from-scripted-model>`__ - - `Traced Model <#traced-model>`__ - - `Benchmark Traced Model - Inference <#benchmark-traced-model-inference>`__ - - `Convert PyTorch Traced Model to OpenVINO Intermediate - Representation <#convert-pytorch-traced-model-to-openvino-intermediate-representation>`__ - - `Benchmark OpenVINO Model Inference Converted From Traced - Model <#benchmark-openvino-model-inference-converted-from-traced-model>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install notebook dependencies - -.. code:: ipython3 - - %pip install -q "openvino>=2023.1.0" scipy Pillow torch torchvision --extra-index-url https://download.pytorch.org/whl/cpu - -Download input data and label map - -.. 
code:: ipython3 - - import requests - from pathlib import Path - from PIL import Image - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("pytorch-to-openvino.ipynb") - - MODEL_DIR = Path("model") - DATA_DIR = Path("data") - - MODEL_DIR.mkdir(exist_ok=True) - DATA_DIR.mkdir(exist_ok=True) - MODEL_NAME = "regnet_y_800mf" - - input_img_path = Path("test.jpg") - if not input_img_path.exists(): - image = Image.open(requests.get("https://farm9.staticflickr.com/8225/8511402100_fea15da1c5_z.jpg", stream=True).raw) - image.save(input_img_path) - else: - image = Image.open(input_img_path) - - labels_file = DATA_DIR / "imagenet_2012.txt" - - if not labels_file.exists(): - resp = requests.get("https://raw.githubusercontent.com/openvinotoolkit/open_model_zoo/master/data/dataset_classes/imagenet_2012.txt") - with labels_file.open("wb") as f: - f.write(resp.content) - - imagenet_classes = labels_file.open("r").read().splitlines() - -Load PyTorch Model ------------------- - - - -Generally, PyTorch models represent an instance of the -``torch.nn.Module`` class, initialized by a state dictionary with model -weights. Typical steps for getting a pre-trained model: - -1. Create an instance of a model class -2. Load checkpoint state dict, which contains pre-trained model weights -3. Turn the model to evaluation for switching some operations to - inference mode - -The ``torchvision`` module provides a ready-to-use set of functions for -model class initialization. We will use -``torchvision.models.regnet_y_800mf``. You can directly pass pre-trained -model weights to the model initialization function using the weights -enum ``RegNet_Y_800MF_Weights.DEFAULT``. - -.. code:: ipython3 - - import torchvision - - # get default weights using available weights Enum for model - weights = torchvision.models.RegNet_Y_800MF_Weights.DEFAULT - - # create model topology and load weights - model = torchvision.models.regnet_y_800mf(weights=weights) - - # switch model to inference mode - model.eval(); - -Prepare Input Data -~~~~~~~~~~~~~~~~~~ - - - -The code below demonstrates how to preprocess input data using a -model-specific transforms module from ``torchvision``. After -transformation, we should concatenate images into batched tensor, in our -case, we will run the model with batch 1, so we just unsqueeze input on -the first dimension. - -.. code:: ipython3 - - import torch - - # Initialize the Weight Transforms - preprocess = weights.transforms() - - # Apply it to the input image - img_transformed = preprocess(image) - - # Add batch dimension to image tensor - input_tensor = img_transformed.unsqueeze(0) - -Run PyTorch Model Inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The model returns a vector of probabilities in raw logits format, -softmax can be applied to get normalized values in the [0, 1] range. For -a demonstration that the output of the original model and OpenVINO -converted is the same, we defined a common postprocessing function which -can be reused later. - -.. 
code:: ipython3 - - import numpy as np - from scipy.special import softmax - - # Perform model inference on input tensor - result = model(input_tensor) - - - # Postprocessing function for getting results in the same way for both PyTorch model inference and OpenVINO - def postprocess_result(output_tensor: np.ndarray, top_k: int = 5): - """ - Posprocess model results. This function applied sofrmax on output tensor and returns specified top_k number of labels with highest probability - Parameters: - output_tensor (np.ndarray): model output tensor with probabilities - top_k (int, *optional*, default 5): number of labels with highest probability for return - Returns: - topk_labels: label ids for selected top_k scores - topk_scores: selected top_k highest scores predicted by model - """ - softmaxed_scores = softmax(output_tensor, -1)[0] - topk_labels = np.argsort(softmaxed_scores)[-top_k:][::-1] - topk_scores = softmaxed_scores[topk_labels] - return topk_labels, topk_scores - - - # Postprocess results - top_labels, top_scores = postprocess_result(result.detach().numpy()) - - # Show results - display(image) - for idx, (label, score) in enumerate(zip(top_labels, top_scores)): - _, predicted_label = imagenet_classes[label].split(" ", 1) - print(f"{idx + 1}: {predicted_label} - {score * 100 :.2f}%") - - - -.. image:: pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_11_0.png - - -.. parsed-literal:: - - 1: tiger cat - 25.91% - 2: Egyptian cat - 10.26% - 3: computer keyboard, keypad - 9.22% - 4: tabby, tabby cat - 9.09% - 5: hamper - 2.35% - - -Benchmark PyTorch Model Inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%timeit - - # Run model inference - model(input_tensor) - - -.. parsed-literal:: - - 21.1 ms ± 602 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) - - -Convert PyTorch Model to OpenVINO Intermediate Representation -------------------------------------------------------------- - - - -Starting from the 2023.0 release OpenVINO supports direct PyTorch models -conversion to OpenVINO Intermediate Representation (IR) format. OpenVINO -model conversion API should be used for these purposes. More details -regarding PyTorch model conversion can be found in OpenVINO -`documentation `__ - -The ``convert_model`` function accepts the PyTorch model object and -returns the ``openvino.Model`` instance ready to load on a device using -``core.compile_model`` or save on disk for next usage using -``ov.save_model``. Optionally, we can provide additional parameters, -such as: - -- ``compress_to_fp16`` - flag to perform model weights compression into - FP16 data format. It may reduce the required space for model storage - on disk and give speedup for inference devices, where FP16 - calculation is supported. -- ``example_input`` - input data sample which can be used for model - tracing. -- ``input_shape`` - the shape of input tensor for conversion - -and any other advanced options supported by model conversion Python API. -More details can be found on this -`page `__ - -.. code:: ipython3 - - import openvino as ov - - # Create OpenVINO Core object instance - core = ov.Core() - - # Convert model to openvino.runtime.Model object - ov_model = ov.convert_model(model) - - # Save openvino.runtime.Model object on disk - ov.save_model(ov_model, MODEL_DIR / f"{MODEL_NAME}_dynamic.xml") - - ov_model - - - - -.. 
parsed-literal:: - - - ] - outputs[ - - ]> - - - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - # Load OpenVINO model on device - compiled_model = core.compile_model(ov_model, device.value) - compiled_model - - - - -.. parsed-literal:: - - - ] - outputs[ - - ]> - - - -Run OpenVINO Model Inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Run model inference - result = compiled_model(input_tensor)[0] - - # Posptorcess results - top_labels, top_scores = postprocess_result(result) - - # Show results - display(image) - for idx, (label, score) in enumerate(zip(top_labels, top_scores)): - _, predicted_label = imagenet_classes[label].split(" ", 1) - print(f"{idx + 1}: {predicted_label} - {score * 100 :.2f}%") - - - -.. image:: pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_20_0.png - - -.. parsed-literal:: - - 1: tiger cat - 25.91% - 2: Egyptian cat - 10.26% - 3: computer keyboard, keypad - 9.22% - 4: tabby, tabby cat - 9.09% - 5: hamper - 2.35% - - -Benchmark OpenVINO Model Inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%timeit - - compiled_model(input_tensor) - - -.. parsed-literal:: - - 3.53 ms ± 25.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) - - -Convert PyTorch Model with Static Input Shape ---------------------------------------------- - - - -The default conversion path preserves dynamic input shapes, in order if -you want to convert the model with static shapes, you can explicitly -specify it during conversion using the ``input_shape`` parameter or -reshape the model into the desired shape after conversion. For the model -reshaping example please check the following -`tutorial `__. - -.. code:: ipython3 - - # Convert model to openvino.runtime.Model object - ov_model = ov.convert_model(model, input=[[1, 3, 224, 224]]) - # Save openvino.runtime.Model object on disk - ov.save_model(ov_model, MODEL_DIR / f"{MODEL_NAME}_static.xml") - ov_model - - - - -.. parsed-literal:: - - - ] - outputs[ - - ]> - - - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - # Load OpenVINO model on device - compiled_model = core.compile_model(ov_model, device.value) - compiled_model - - - - -.. parsed-literal:: - - - ] - outputs[ - - ]> - - - -Now, we can see that input of our converted model is tensor of shape [1, -3, 224, 224] instead of [?, 3, ?, ?] reported by previously converted -model. - -Run OpenVINO Model Inference with Static Input Shape -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Run model inference - result = compiled_model(input_tensor)[0] - - # Posptorcess results - top_labels, top_scores = postprocess_result(result) - - # Show results - display(image) - for idx, (label, score) in enumerate(zip(top_labels, top_scores)): - _, predicted_label = imagenet_classes[label].split(" ", 1) - print(f"{idx + 1}: {predicted_label} - {score * 100 :.2f}%") - - - -.. 
image:: pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_31_0.png - - -.. parsed-literal:: - - 1: tiger cat - 25.91% - 2: Egyptian cat - 10.26% - 3: computer keyboard, keypad - 9.22% - 4: tabby, tabby cat - 9.09% - 5: hamper - 2.35% - - -Benchmark OpenVINO Model Inference with Static Input Shape -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%timeit - - compiled_model(input_tensor) - - -.. parsed-literal:: - - 3.25 ms ± 73 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) - - -Convert TorchScript Model to OpenVINO Intermediate Representation ------------------------------------------------------------------ - - - -TorchScript is a way to create serializable and optimizable models from -PyTorch code. Any TorchScript program can be saved from a Python process -and loaded in a process where there is no Python dependency. More -details about TorchScript can be found in `PyTorch -documentation `__. - -There are 2 possible ways to convert the PyTorch model to TorchScript: - -- ``torch.jit.script`` - Scripting a function or ``nn.Module`` will - inspect the source code, compile it as TorchScript code using the - TorchScript compiler, and return a ``ScriptModule`` or - ``ScriptFunction``. -- ``torch.jit.trace`` - Trace a function and return an executable or - ``ScriptFunction`` that will be optimized using just-in-time - compilation. - -Let’s consider both approaches and their conversion into OpenVINO IR. - -Scripted Model -~~~~~~~~~~~~~~ - - - -``torch.jit.script`` inspects model source code and compiles it to -``ScriptModule``. After compilation model can be used for inference or -saved on disk using the ``torch.jit.save`` function and after that -restored with ``torch.jit.load`` in any other environment without the -original PyTorch model code definitions. - -TorchScript itself is a subset of the Python language, so not all -features in Python work, but TorchScript provides enough functionality -to compute on tensors and do control-dependent operations. For a -complete guide, see the `TorchScript Language -Reference `__. - -.. code:: ipython3 - - # Get model path - scripted_model_path = MODEL_DIR / f"{MODEL_NAME}_scripted.pth" - - # Compile and save model if it has not been compiled before or load compiled model - if not scripted_model_path.exists(): - scripted_model = torch.jit.script(model) - torch.jit.save(scripted_model, scripted_model_path) - else: - scripted_model = torch.jit.load(scripted_model_path) - - # Run scripted model inference - result = scripted_model(input_tensor) - - # Postprocess results - top_labels, top_scores = postprocess_result(result.detach().numpy()) - - # Show results - display(image) - for idx, (label, score) in enumerate(zip(top_labels, top_scores)): - _, predicted_label = imagenet_classes[label].split(" ", 1) - print(f"{idx + 1}: {predicted_label} - {score * 100 :.2f}%") - - - -.. image:: pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_35_0.png - - -.. parsed-literal:: - - 1: tiger cat - 25.91% - 2: Egyptian cat - 10.26% - 3: computer keyboard, keypad - 9.22% - 4: tabby, tabby cat - 9.09% - 5: hamper - 2.35% - - -Benchmark Scripted Model Inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%timeit - - scripted_model(input_tensor) - - -.. parsed-literal:: - - 18.3 ms ± 1.12 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each) - - -Convert PyTorch Scripted Model to OpenVINO Intermediate Representation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The conversion step for the scripted model to OpenVINO IR is similar to -the original PyTorch model. - -.. code:: ipython3 - - # Convert model to openvino.runtime.Model object - ov_model = ov.convert_model(scripted_model) - - # Load OpenVINO model on device - compiled_model = core.compile_model(ov_model, device.value) - - # Run OpenVINO model inference - result = compiled_model(input_tensor, device.value)[0] - - # Postprocess results - top_labels, top_scores = postprocess_result(result) - - # Show results - display(image) - for idx, (label, score) in enumerate(zip(top_labels, top_scores)): - _, predicted_label = imagenet_classes[label].split(" ", 1) - print(f"{idx + 1}: {predicted_label} - {score * 100 :.2f}%") - - - -.. image:: pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_39_0.png - - -.. parsed-literal:: - - 1: tiger cat - 25.91% - 2: Egyptian cat - 10.26% - 3: computer keyboard, keypad - 9.22% - 4: tabby, tabby cat - 9.09% - 5: hamper - 2.35% - - -Benchmark OpenVINO Model Inference Converted From Scripted Model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%timeit - - compiled_model(input_tensor) - - -.. parsed-literal:: - - 3.45 ms ± 72.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) - - -Traced Model -~~~~~~~~~~~~ - - - -Using ``torch.jit.trace``, you can turn an existing module or Python -function into a TorchScript ``ScriptFunction`` or ``ScriptModule``. You -must provide example inputs, and model will be executed, recording the -operations performed on all the tensors. - -- The resulting recording of a standalone function produces - ``ScriptFunction``. - -- The resulting recording of ``nn.Module.forward`` or ``nn.Module`` - produces ``ScriptModule``. - -In the same way like scripted model, traced model can be used for -inference or saved on disk using ``torch.jit.save`` function and after -that restored with ``torch.jit.load`` in any other environment without -original PyTorch model code definitions. - -.. code:: ipython3 - - # Get model path - traced_model_path = MODEL_DIR / f"{MODEL_NAME}_traced.pth" - - # Trace and save model if it has not been traced before or load traced model - if not traced_model_path.exists(): - traced_model = torch.jit.trace(model, example_inputs=input_tensor) - torch.jit.save(traced_model, traced_model_path) - else: - traced_model = torch.jit.load(traced_model_path) - - # Run traced model inference - result = traced_model(input_tensor) - - # Postprocess results - top_labels, top_scores = postprocess_result(result.detach().numpy()) - - # Show results - display(image) - for idx, (label, score) in enumerate(zip(top_labels, top_scores)): - _, predicted_label = imagenet_classes[label].split(" ", 1) - print(f"{idx + 1}: {predicted_label} - {score * 100 :.2f}%") - - - -.. image:: pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_43_0.png - - -.. parsed-literal:: - - 1: tiger cat - 25.91% - 2: Egyptian cat - 10.26% - 3: computer keyboard, keypad - 9.22% - 4: tabby, tabby cat - 9.09% - 5: hamper - 2.35% - - -Benchmark Traced Model Inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%timeit - - traced_model(input_tensor) - - -.. parsed-literal:: - - 18 ms ± 365 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each) - - -Convert PyTorch Traced Model to OpenVINO Intermediate Representation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The conversion step for a traced model to OpenVINO IR is similar to the -original PyTorch model. - -.. code:: ipython3 - - # Convert model to openvino.runtime.Model object - ov_model = ov.convert_model(traced_model) - - # Load OpenVINO model on device - compiled_model = core.compile_model(ov_model, device.value) - - # Run OpenVINO model inference - result = compiled_model(input_tensor)[0] - - # Postprocess results - top_labels, top_scores = postprocess_result(result) - - # Show results - display(image) - for idx, (label, score) in enumerate(zip(top_labels, top_scores)): - _, predicted_label = imagenet_classes[label].split(" ", 1) - print(f"{idx + 1}: {predicted_label} - {score * 100 :.2f}%") - - - -.. image:: pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_47_0.png - - -.. parsed-literal:: - - 1: tiger cat - 25.91% - 2: Egyptian cat - 10.26% - 3: computer keyboard, keypad - 9.22% - 4: tabby, tabby cat - 9.09% - 5: hamper - 2.35% - - -Benchmark OpenVINO Model Inference Converted From Traced Model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%timeit - - compiled_model(input_tensor)[0] - - -.. parsed-literal:: - - 3.63 ms ± 43.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) - diff --git a/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_11_0.png b/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_11_0.png deleted file mode 100644 index 74fb9b51618f21..00000000000000 --- a/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_11_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1cee056787362b7bcfed1d50822d2ce27e3df9a769849b5b2a8e28c5dcf58e9 -size 538097 diff --git a/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_20_0.png b/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_20_0.png deleted file mode 100644 index 74fb9b51618f21..00000000000000 --- a/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_20_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1cee056787362b7bcfed1d50822d2ce27e3df9a769849b5b2a8e28c5dcf58e9 -size 538097 diff --git a/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_31_0.png b/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_31_0.png deleted file mode 100644 index 74fb9b51618f21..00000000000000 --- a/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_31_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1cee056787362b7bcfed1d50822d2ce27e3df9a769849b5b2a8e28c5dcf58e9 -size 538097 diff --git a/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_35_0.png b/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_35_0.png deleted file mode 100644 index 74fb9b51618f21..00000000000000 --- a/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_35_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1cee056787362b7bcfed1d50822d2ce27e3df9a769849b5b2a8e28c5dcf58e9 
-size 538097 diff --git a/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_39_0.png b/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_39_0.png deleted file mode 100644 index 74fb9b51618f21..00000000000000 --- a/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_39_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1cee056787362b7bcfed1d50822d2ce27e3df9a769849b5b2a8e28c5dcf58e9 -size 538097 diff --git a/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_43_0.png b/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_43_0.png deleted file mode 100644 index 74fb9b51618f21..00000000000000 --- a/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_43_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1cee056787362b7bcfed1d50822d2ce27e3df9a769849b5b2a8e28c5dcf58e9 -size 538097 diff --git a/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_47_0.png b/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_47_0.png deleted file mode 100644 index 74fb9b51618f21..00000000000000 --- a/docs/notebooks/pytorch-to-openvino-with-output_files/pytorch-to-openvino-with-output_47_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1cee056787362b7bcfed1d50822d2ce27e3df9a769849b5b2a8e28c5dcf58e9 -size 538097 diff --git a/docs/notebooks/qrcode-monster-with-output.rst b/docs/notebooks/qrcode-monster-with-output.rst deleted file mode 100644 index 872973a6cd78a7..00000000000000 --- a/docs/notebooks/qrcode-monster-with-output.rst +++ /dev/null @@ -1,1500 +0,0 @@ -Generate creative QR codes with ControlNet QR Code Monster and OpenVINO™ -======================================================================== - -`Stable Diffusion `__, a -cutting-edge image generation technique, but it can be further enhanced -by combining it with `ControlNet `__, -a widely used control network approach. The combination allows Stable -Diffusion to use a condition input to guide the image generation -process, resulting in highly accurate and visually appealing images. The -condition input could be in the form of various types of data such as -scribbles, edge maps, pose key points, depth maps, segmentation maps, -normal maps, or any other relevant information that helps to guide the -content of the generated image, for example - QR codes! This method can -be particularly useful in complex image generation scenarios where -precise control and fine-tuning are required to achieve the desired -results. - -In this tutorial, we will learn how to convert and run `Controlnet QR -Code Monster For -SD-1.5 `__ -by `monster-labs `__. An additional part -demonstrates how to run quantization with -`NNCF `__ to speed up -pipeline. 
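-
-For intuition, the control image here is just an ordinary picture: a plain QR code
-rendered with the ``qrcode`` package (installed in the Prerequisites step below) can
-already serve as the conditioning signal. The snippet below is an illustrative sketch
-only; the tutorial later defines a more careful ``create_code`` helper that also pads
-and aligns the code for the model input.
-
-.. code:: ipython3
-
-    # Illustrative sketch: render a QR code as a PIL image that could be used as the
-    # ControlNet condition image. The content string is just an example value.
-    import qrcode
-
-    qr = qrcode.QRCode(error_correction=qrcode.constants.ERROR_CORRECT_H, box_size=16, border=1)
-    qr.add_data("Hi OpenVINO")
-    qr.make(fit=True)
-    condition_image = qr.make_image(fill_color="black", back_color="white").get_image()
-    print(condition_image.size)
-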
- -|image0| - -If you want to learn more about ControlNet and particularly on -conditioning by pose, please refer to this -`tutorial `__ - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Instantiating Generation - Pipeline <#instantiating-generation-pipeline>`__ - - - `ControlNet in Diffusers - library <#controlnet-in-diffusers-library>`__ - -- `Convert models to OpenVINO Intermediate representation (IR) - format <#convert-models-to-openvino-intermediate-representation-ir-format>`__ - - - `ControlNet conversion <#controlnet-conversion>`__ - - `Text Encoder <#text-encoder>`__ - - `UNet conversion <#unet-conversion>`__ - - `VAE Decoder conversion <#vae-decoder-conversion>`__ - -- `Select inference device for Stable Diffusion - pipeline <#select-inference-device-for-stable-diffusion-pipeline>`__ -- `Prepare Inference pipeline <#prepare-inference-pipeline>`__ -- `Quantization <#quantization>`__ - - - `Prepare calibration datasets <#prepare-calibration-datasets>`__ - - `Run quantization <#run-quantization>`__ - - `Compare model file sizes <#compare-model-file-sizes>`__ - - `Compare inference time of the FP16 and INT8 - pipelines <#compare-inference-time-of-the-fp16-and-int8-pipelines>`__ - -- `Running Text-to-Image Generation with ControlNet Conditioning and - OpenVINO <#running-text-to-image-generation-with-controlnet-conditioning-and-openvino>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. |image0| image:: https://github.com/openvinotoolkit/openvino_notebooks/assets/76463150/1a5978c6-e7a0-4824-9318-a3d8f4912c47 - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q accelerate "diffusers>=0.24.0" transformers "torch>=2.1" "gradio>=4.19" qrcode opencv-python "peft>=0.6.2" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "openvino>=2023.1.0" "nncf>=2.7.0" "matplotlib>=3.4" - -Instantiating Generation Pipeline ---------------------------------- - - - -ControlNet in Diffusers library -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -For working with Stable Diffusion and ControlNet models, we will use -Hugging Face `Diffusers `__ -library. To experiment with ControlNet, Diffusers exposes the -`StableDiffusionControlNetPipeline `__ -similar to the `other Diffusers -pipelines `__. -Central to the ``StableDiffusionControlNetPipeline`` is the -``controlnet`` argument which enables providing a particularly trained -`ControlNetModel `__ -instance while keeping the pre-trained diffusion model weights the same. -The code below demonstrates how to create -``StableDiffusionControlNetPipeline``, using the ``controlnet-openpose`` -controlnet model and ``stable-diffusion-v1-5``: - -.. 
code:: ipython3 - - from diffusers import ( - StableDiffusionControlNetPipeline, - ControlNetModel, - ) - - from pathlib import Path - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("qrcode-monster.ipynb") - - controlnet = ControlNetModel.from_pretrained("monster-labs/control_v1p_sd15_qrcode_monster") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "botp/stable-diffusion-v1-5", - controlnet=controlnet, - ) - - -.. parsed-literal:: - - 2024-12-05 09:21:58.637418: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-12-05 09:21:58.649752: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - WARNING: All log messages before absl::InitializeLog() is called are written to STDERR - E0000 00:00:1733376118.663808 222102 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - E0000 00:00:1733376118.667978 222102 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2024-12-05 09:21:58.683751: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - - - -.. parsed-literal:: - - Loading pipeline components...: 0%| | 0/7 [00:00`__. - -.. code:: ipython3 - - vae_ir_path = Path("./vae.xml") - - - class VAEDecoderWrapper(torch.nn.Module): - def __init__(self, vae): - super().__init__() - vae.eval() - self.vae = vae - - def forward(self, latents): - return self.vae.decode(latents) - - - if not vae_ir_path.exists(): - vae_decoder = VAEDecoderWrapper(pipe.vae) - latents = torch.zeros((1, 4, 96, 96)) - - vae_decoder.eval() - with torch.no_grad(): - ov_model = ov.convert_model(vae_decoder, example_input=latents) - ov.save_model(ov_model, vae_ir_path) - del ov_model - del pipe.vae - cleanup_torchscript_cache() - print("VAE decoder successfully converted to IR") - else: - del pipe.vae - print(f"VAE decoder will be loaded from {vae_ir_path}") - - -.. parsed-literal:: - - VAE decoder will be loaded from vae.xml - - -Select inference device for Stable Diffusion pipeline ------------------------------------------------------ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. 
parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Prepare Inference pipeline --------------------------- - - - -The stable diffusion model takes both a latent seed and a text prompt as -input. The latent seed is then used to generate random latent image -representations of size :math:`96 \times 96` where as the text prompt is -transformed to text embeddings of size :math:`77 \times 768` via CLIP’s -text encoder. - -Next, the U-Net iteratively *denoises* the random latent image -representations while being conditioned on the text embeddings. In -comparison with the original stable-diffusion pipeline, latent image -representation, encoder hidden states, and control condition annotation -passed via ControlNet on each denoising step for obtaining middle and -down blocks attention parameters, these attention blocks results -additionally will be provided to the UNet model for the control -generation process. The output of the U-Net, being the noise residual, -is used to compute a denoised latent image representation via a -scheduler algorithm. Many different scheduler algorithms can be used for -this computation, each having its pros and cons. For Stable Diffusion, -it is recommended to use one of: - -- `PNDM - scheduler `__ -- `DDIM - scheduler `__ -- `K-LMS - scheduler `__ - -Theory on how the scheduler algorithm function works is out of scope for -this notebook, but in short, you should remember that they compute the -predicted denoised image representation from the previous noise -representation and the predicted noise residual. For more information, -it is recommended to look into `Elucidating the Design Space of -Diffusion-Based Generative Models `__ - -In this tutorial, instead of using Stable Diffusion’s default -`PNDMScheduler `__, -we use -`EulerAncestralDiscreteScheduler `__, -recommended by authors. More information regarding schedulers can be -found -`here `__. - -The *denoising* process is repeated a given number of times (by default -50) to step-by-step retrieve better latent image representations. Once -complete, the latent image representation is decoded by the decoder part -of the variational auto-encoder. - -Similarly to Diffusers ``StableDiffusionControlNetPipeline``, we define -our own ``OVContrlNetStableDiffusionPipeline`` inference pipeline based -on OpenVINO. - -.. code:: ipython3 - - from diffusers import DiffusionPipeline - from transformers import CLIPTokenizer - from typing import Union, List, Optional, Tuple - import cv2 - import numpy as np - - - def scale_fit_to_window(dst_width: int, dst_height: int, image_width: int, image_height: int): - """ - Preprocessing helper function for calculating image size for resize with peserving original aspect ratio - and fitting image to specific window size - - Parameters: - dst_width (int): destination window width - dst_height (int): destination window height - image_width (int): source image width - image_height (int): source image height - Returns: - result_width (int): calculated width for resize - result_height (int): calculated height for resize - """ - im_scale = min(dst_height / image_height, dst_width / image_width) - return int(im_scale * image_width), int(im_scale * image_height) - - - def preprocess(image: Image.Image): - """ - Image preprocessing function. 
Takes image in PIL.Image format, resizes it to keep aspect ration and fits to model input window 768x768, - then converts it to np.ndarray and adds padding with zeros on right or bottom side of image (depends from aspect ratio), after that - converts data to float32 data type and change range of values from [0, 255] to [-1, 1], finally, converts data layout from planar NHWC to NCHW. - The function returns preprocessed input tensor and padding size, which can be used in postprocessing. - - Parameters: - image (Image.Image): input image - Returns: - image (np.ndarray): preprocessed image tensor - pad (Tuple[int]): pading size for each dimension for restoring image size in postprocessing - """ - src_width, src_height = image.size - dst_width, dst_height = scale_fit_to_window(768, 768, src_width, src_height) - image = image.convert("RGB") - image = np.array(image.resize((dst_width, dst_height), resample=Image.Resampling.LANCZOS))[None, :] - pad_width = 768 - dst_width - pad_height = 768 - dst_height - pad = ((0, 0), (0, pad_height), (0, pad_width), (0, 0)) - image = np.pad(image, pad, mode="constant") - image = image.astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - return image, pad - - - def randn_tensor( - shape: Union[Tuple, List], - dtype: Optional[np.dtype] = np.float32, - ): - """ - Helper function for generation random values tensor with given shape and data type - - Parameters: - shape (Union[Tuple, List]): shape for filling random values - dtype (np.dtype, *optiona*, np.float32): data type for result - Returns: - latents (np.ndarray): tensor with random values with given data type and shape (usually represents noise in latent space) - """ - latents = np.random.randn(*shape).astype(dtype) - - return latents - - - class OVContrlNetStableDiffusionPipeline(DiffusionPipeline): - """ - OpenVINO inference pipeline for Stable Diffusion with ControlNet guidence - """ - - def __init__( - self, - tokenizer: CLIPTokenizer, - scheduler, - core: ov.Core, - controlnet: ov.Model, - text_encoder: ov.Model, - unet: ov.Model, - vae_decoder: ov.Model, - device: str = "AUTO", - ): - super().__init__() - self.tokenizer = tokenizer - self.vae_scale_factor = 8 - self.scheduler = scheduler - self.load_models(core, device, controlnet, text_encoder, unet, vae_decoder) - self.set_progress_bar_config(disable=True) - - def load_models( - self, - core: ov.Core, - device: str, - controlnet: ov.Model, - text_encoder: ov.Model, - unet: ov.Model, - vae_decoder: ov.Model, - ): - """ - Function for loading models on device using OpenVINO - - Parameters: - core (Core): OpenVINO runtime Core class instance - device (str): inference device - controlnet (Model): OpenVINO Model object represents ControlNet - text_encoder (Model): OpenVINO Model object represents text encoder - unet (Model): OpenVINO Model object represents UNet - vae_decoder (Model): OpenVINO Model object represents vae decoder - Returns - None - """ - self.text_encoder = core.compile_model(text_encoder, device) - self.text_encoder_out = self.text_encoder.output(0) - self.register_to_config(controlnet=core.compile_model(controlnet, device)) - self.register_to_config(unet=core.compile_model(unet, device)) - self.unet_out = self.unet.output(0) - self.vae_decoder = core.compile_model(vae_decoder, device) - self.vae_decoder_out = self.vae_decoder.output(0) - - def __call__( - self, - prompt: Union[str, List[str]], - image: Image.Image, - num_inference_steps: int = 10, - negative_prompt: Union[str, List[str]] = None, - guidance_scale: float = 
7.5, - controlnet_conditioning_scale: float = 1.0, - eta: float = 0.0, - latents: Optional[np.array] = None, - output_type: Optional[str] = "pil", - ): - """ - Function invoked when calling the pipeline for generation. - - Parameters: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - image (`Image.Image`): - `Image`, or tensor representing an image batch which will be repainted according to `prompt`. - num_inference_steps (`int`, *optional*, defaults to 100): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - negative_prompt (`str` or `List[str]`): - negative prompt or prompts for generation - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. This pipeline requires a value of at least `1`. - latents (`np.ndarray`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `Image.Image` or `np.array`. - Returns: - image ([List[Union[np.ndarray, Image.Image]]): generaited images - - """ - - # 1. Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # 2. Encode input prompt - text_embeddings = self._encode_prompt(prompt, negative_prompt=negative_prompt) - - # 3. Preprocess image - orig_width, orig_height = image.size - image, pad = preprocess(image) - height, width = image.shape[-2:] - if do_classifier_free_guidance: - image = np.concatenate(([image] * 2)) - - # 4. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 6. Prepare latent variables - num_channels_latents = 4 - latents = self.prepare_latents( - batch_size, - num_channels_latents, - height, - width, - text_embeddings.dtype, - latents, - ) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # Expand the latents if we are doing classifier free guidance. - # The latents are expanded 3 times because for pix2pix the guidance\ - # is applied for both the text and the input image. 
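- # (In this ControlNet pipeline the latents are duplicated twice, not three times:
- # one copy for the unconditional pass and one for the text-conditioned pass.)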
- latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - result = self.controlnet([latent_model_input, t, text_embeddings, image]) - down_and_mid_blok_samples = [sample * controlnet_conditioning_scale for _, sample in result.items()] - - # predict the noise residual - noise_pred = self.unet([latent_model_input, t, text_embeddings, *down_and_mid_blok_samples])[self.unet_out] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1] - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(torch.from_numpy(noise_pred), t, torch.from_numpy(latents)).prev_sample.numpy() - - # update progress - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - - # 8. Post-processing - image = self.decode_latents(latents, pad) - - # 9. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - image = [img.resize((orig_width, orig_height), Image.Resampling.LANCZOS) for img in image] - else: - image = [cv2.resize(img, (orig_width, orig_width)) for img in image] - - return image - - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int = 1, - do_classifier_free_guidance: bool = True, - negative_prompt: Union[str, List[str]] = None, - ): - """ - Encodes the prompt into text encoder hidden states. - - Parameters: - prompt (str or list(str)): prompt to be encoded - num_images_per_prompt (int): number of images that should be generated per prompt - do_classifier_free_guidance (bool): whether to use classifier free guidance or not - negative_prompt (str or list(str)): negative prompt to be encoded - Returns: - text_embeddings (np.ndarray): text encoder hidden states - """ - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - # tokenize input prompts - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - - text_embeddings = self.text_encoder(text_input_ids)[self.text_encoder_out] - - # duplicate text embeddings for each generation per prompt - if num_images_per_prompt != 1: - bs_embed, seq_len, _ = text_embeddings.shape - text_embeddings = np.tile(text_embeddings, (1, num_images_per_prompt, 1)) - text_embeddings = np.reshape(text_embeddings, (bs_embed * num_images_per_prompt, seq_len, -1)) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_tokens: List[str] - max_length = text_input_ids.shape[-1] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - else: - uncond_tokens = negative_prompt - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - - uncond_embeddings = self.text_encoder(uncond_input.input_ids)[self.text_encoder_out] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = uncond_embeddings.shape[1] - uncond_embeddings = np.tile(uncond_embeddings, (1, num_images_per_prompt, 1)) - uncond_embeddings = np.reshape(uncond_embeddings, 
(batch_size * num_images_per_prompt, seq_len, -1)) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings = np.concatenate([uncond_embeddings, text_embeddings]) - - return text_embeddings - - def prepare_latents( - self, - batch_size: int, - num_channels_latents: int, - height: int, - width: int, - dtype: np.dtype = np.float32, - latents: np.ndarray = None, - ): - """ - Preparing noise to image generation. If initial latents are not provided, they will be generated randomly, - then prepared latents scaled by the standard deviation required by the scheduler - - Parameters: - batch_size (int): input batch size - num_channels_latents (int): number of channels for noise generation - height (int): image height - width (int): image width - dtype (np.dtype, *optional*, np.float32): dtype for latents generation - latents (np.ndarray, *optional*, None): initial latent noise tensor, if not provided will be generated - Returns: - latents (np.ndarray): scaled initial noise for diffusion - """ - shape = ( - batch_size, - num_channels_latents, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - if latents is None: - latents = randn_tensor(shape, dtype=dtype) - else: - latents = latents - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * np.array(self.scheduler.init_noise_sigma) - return latents - - def decode_latents(self, latents: np.array, pad: Tuple[int]): - """ - Decode predicted image from latent space using VAE Decoder and unpad image result - - Parameters: - latents (np.ndarray): image encoded in diffusion latent space - pad (Tuple[int]): each side padding sizes obtained on preprocessing step - Returns: - image: decoded by VAE decoder image - """ - latents = 1 / 0.18215 * latents - image = self.vae_decoder(latents)[self.vae_decoder_out] - (_, end_h), (_, end_w) = pad[1:3] - h, w = image.shape[2:] - unpad_h = h - end_h - unpad_w = w - end_w - image = image[:, :, :unpad_h, :unpad_w] - image = np.clip(image / 2 + 0.5, 0, 1) - image = np.transpose(image, (0, 2, 3, 1)) - return image - -.. code:: ipython3 - - import qrcode - - - def create_code(content: str): - """Creates QR codes with provided content.""" - qr = qrcode.QRCode( - version=1, - error_correction=qrcode.constants.ERROR_CORRECT_H, - box_size=16, - border=0, - ) - qr.add_data(content) - qr.make(fit=True) - img = qr.make_image(fill_color="black", back_color="white") - img.save("base_qrcode.png") - img = Image.open("base_qrcode.png") - - # find smallest image size multiple of 256 that can fit qr - offset_min = 8 * 16 - w, h = img.size - w = (w + 255 + offset_min) // 256 * 256 - h = (h + 255 + offset_min) // 256 * 256 - if w > 1024: - raise RuntimeError("QR code is too large, please use a shorter content") - bg = Image.new("L", (w, h), 128) - - # align on 16px grid - coords = ((w - img.size[0]) // 2 // 16 * 16, (h - img.size[1]) // 2 // 16 * 16) - bg.paste(img, coords) - return bg - -.. 
code:: ipython3 - - from transformers import CLIPTokenizer - from diffusers import EulerAncestralDiscreteScheduler - - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) - - core = ov.Core() - - ov_pipe = OVContrlNetStableDiffusionPipeline( - tokenizer, - scheduler, - core, - controlnet_ir_path, - text_encoder_ir_path, - unet_ir_path, - vae_ir_path, - device=device.value, - ) - - -.. parsed-literal:: - - /home/ea/work/py311/lib/python3.11/site-packages/diffusers/configuration_utils.py:140: FutureWarning: Accessing config attribute `unet` directly via 'OVContrlNetStableDiffusionPipeline' object attribute is deprecated. Please access 'unet' over 'OVContrlNetStableDiffusionPipeline's config object instead, e.g. 'scheduler.config.unet'. - deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) - - -Now, let’s see model in action - -.. code:: ipython3 - - np.random.seed(42) - - qrcode_image = create_code("Hi OpenVINO") - image = ov_pipe( - "cozy town on snowy mountain slope 8k", - qrcode_image, - negative_prompt="blurry unreal occluded", - num_inference_steps=25, - guidance_scale=7.7, - controlnet_conditioning_scale=1.4, - )[0] - - image - - -.. parsed-literal:: - - /home/ea/work/py311/lib/python3.11/site-packages/diffusers/configuration_utils.py:140: FutureWarning: Accessing config attribute `controlnet` directly via 'OVContrlNetStableDiffusionPipeline' object attribute is deprecated. Please access 'controlnet' over 'OVContrlNetStableDiffusionPipeline's config object instead, e.g. 'scheduler.config.controlnet'. - deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) - - - - -.. image:: qrcode-monster-with-output_files/qrcode-monster-with-output_22_1.png - - - -Quantization ------------- - - - -`NNCF `__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -According to ``OVContrlNetStableDiffusionPipeline`` structure, -ControlNet and UNet are used in the cycle repeating inference on each -diffusion step, while other parts of pipeline take part only once. That -is why computation cost and speed of ControlNet and UNet become the -critical path in the pipeline. Quantizing the rest of the SD pipeline -does not significantly improve inference performance but can lead to a -substantial degradation of accuracy. - -The optimization process contains the following steps: - -1. Create a calibration dataset for quantization. -2. Run ``nncf.quantize()`` to obtain quantized model. -3. Save the ``INT8`` model using ``openvino.save_model()`` function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - skip_for_device = "GPU" in device.value - to_quantize = quantization_widget(not skip_for_device) - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. 
code:: ipython3 - - # Fetch `skip_kernel_extension` module - import requests - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - int8_pipe = None - - %load_ext skip_kernel_extension - -Prepare calibration datasets -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use a prompts below as calibration data for ControlNet and UNet. To -collect intermediate model inputs for calibration we should customize -``CompiledModel``. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - text_prompts = [ - "a bilboard in NYC with a qrcode", - "a samurai side profile, realistic, 8K, fantasy", - "A sky view of a colorful lakes and rivers flowing through the desert", - "Bright sunshine coming through the cracks of a wet, cave wall of big rocks", - "A city view with clouds", - "A forest overlooking a mountain", - "Sky view of highly aesthetic, ancient greek thermal baths in beautiful nature", - "A dream-like futuristic city with the light trails of cars zipping through it's many streets", - ] - - negative_prompts = [ - "blurry unreal occluded", - "low contrast disfigured uncentered mangled", - "amateur out of frame low quality nsfw", - "ugly underexposed jpeg artifacts", - "low saturation disturbing content", - "overexposed severe distortion", - "amateur NSFW", - "ugly mutilated out of frame disfigured.", - ] - - qr_code_contents = [ - "Hugging Face", - "pre-trained diffusion model", - "image generation technique", - "control network", - "AI QR Code Generator", - "Explore NNCF today!", - "Join OpenVINO community", - "network compression", - ] - qrcode_images = [create_code(content) for content in qr_code_contents] - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from tqdm.notebook import tqdm - from transformers import set_seed - from typing import Any, Dict, List - - set_seed(1) - - num_inference_steps = 25 - - class CompiledModelDecorator(ov.CompiledModel): - def __init__(self, compiled_model, prob: float): - super().__init__(compiled_model) - self.data_cache = [] - self.prob = np.clip(prob, 0, 1) - - def __call__(self, *args, **kwargs): - if np.random.rand() >= self.prob: - self.data_cache.append(*args) - return super().__call__(*args, **kwargs) - - def collect_calibration_data(pipeline: OVContrlNetStableDiffusionPipeline, subset_size: int) -> List[Dict]: - original_unet = pipeline.unet - pipeline.unet = CompiledModelDecorator(original_unet, prob=0) - pipeline.set_progress_bar_config(disable=True) - - pbar = tqdm(total=subset_size) - diff = 0 - for prompt, qrcode_image, negative_prompt in zip(text_prompts, qrcode_images, negative_prompts): - _ = pipeline( - prompt, - qrcode_image, - negative_prompt=negative_prompt, - num_inference_steps=num_inference_steps, - ) - collected_subset_size = len(pipeline.unet.data_cache) - pbar.update(collected_subset_size - diff) - if collected_subset_size >= subset_size: - break - diff = collected_subset_size - - calibration_dataset = pipeline.unet.data_cache - pipeline.set_progress_bar_config(disable=False) - pipeline.unet = original_unet - return calibration_dataset - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - CONTROLNET_INT8_OV_PATH = Path("controlnet_int8.xml") - UNET_INT8_OV_PATH = Path("unet_int8.xml") - - if not (CONTROLNET_INT8_OV_PATH.exists() and UNET_INT8_OV_PATH.exists()): - subset_size = 200 - unet_calibration_data = collect_calibration_data(ov_pipe, subset_size=subset_size) - -The first three inputs of ControlNet are the same as the inputs of UNet, -the last ControlNet input is a preprocessed ``qrcode_image``. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - if not CONTROLNET_INT8_OV_PATH.exists(): - control_calibration_data = [] - prev_idx = 0 - for qrcode_image in qrcode_images: - preprocessed_image, _ = preprocess(qrcode_image) - for i in range(prev_idx, prev_idx + num_inference_steps): - control_calibration_data.append(unet_calibration_data[i][:3] + [preprocessed_image]) - prev_idx += num_inference_steps - -Run quantization -~~~~~~~~~~~~~~~~ - - - -Create a quantized model from the pre-trained converted OpenVINO model. -``FastBiasCorrection`` algorithm is disabled due to minimal accuracy -improvement in SD models and increased quantization time. - - **NOTE**: Quantization is time and memory consuming operation. - Running quantization code below may take some time. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - - if not UNET_INT8_OV_PATH.exists(): - unet = core.read_model(unet_ir_path) - quantized_unet = nncf.quantize( - model=unet, - calibration_dataset=nncf.Dataset(unet_calibration_data), - subset_size=subset_size, - model_type=nncf.ModelType.TRANSFORMER, - advanced_parameters=nncf.AdvancedQuantizationParameters( - disable_bias_correction=True - ) - ) - ov.save_model(quantized_unet, UNET_INT8_OV_PATH) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - if not CONTROLNET_INT8_OV_PATH.exists(): - controlnet = core.read_model(controlnet_ir_path) - quantized_controlnet = nncf.quantize( - model=controlnet, - calibration_dataset=nncf.Dataset(control_calibration_data), - subset_size=subset_size, - model_type=nncf.ModelType.TRANSFORMER, - advanced_parameters=nncf.AdvancedQuantizationParameters( - disable_bias_correction=True - ) - ) - ov.save_model(quantized_controlnet, CONTROLNET_INT8_OV_PATH) - -Let’s compare the images generated by the original and optimized -pipelines. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - np.random.seed(int(42)) - int8_pipe = OVContrlNetStableDiffusionPipeline(tokenizer, scheduler, core, CONTROLNET_INT8_OV_PATH, text_encoder_ir_path, UNET_INT8_OV_PATH, vae_ir_path, device=device.value) - - int8_image = int8_pipe( - "cozy town on snowy mountain slope 8k", - qrcode_image, - negative_prompt="blurry unreal occluded", - num_inference_steps=25, - guidance_scale=7.7, - controlnet_conditioning_scale=1.4 - )[0] - - -.. parsed-literal:: - - /home/ea/work/py311/lib/python3.11/site-packages/diffusers/configuration_utils.py:140: FutureWarning: Accessing config attribute `unet` directly via 'OVContrlNetStableDiffusionPipeline' object attribute is deprecated. Please access 'unet' over 'OVContrlNetStableDiffusionPipeline's config object instead, e.g. 'scheduler.config.unet'. 
- deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) - /home/ea/work/py311/lib/python3.11/site-packages/diffusers/configuration_utils.py:140: FutureWarning: Accessing config attribute `controlnet` directly via 'OVContrlNetStableDiffusionPipeline' object attribute is deprecated. Please access 'controlnet' over 'OVContrlNetStableDiffusionPipeline's config object instead, e.g. 'scheduler.config.controlnet'. - deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import matplotlib.pyplot as plt - - def visualize_results(orig_img:Image.Image, optimized_img:Image.Image): - """ - Helper function for results visualization - - Parameters: - orig_img (Image.Image): generated image using FP16 models - optimized_img (Image.Image): generated image using quantized models - Returns: - fig (matplotlib.pyplot.Figure): matplotlib generated figure contains drawing result - """ - orig_title = "FP16 pipeline" - control_title = "INT8 pipeline" - figsize = (20, 20) - fig, axs = plt.subplots(1, 2, figsize=figsize, sharex='all', sharey='all') - list_axes = list(axs.flat) - for a in list_axes: - a.set_xticklabels([]) - a.set_yticklabels([]) - a.get_xaxis().set_visible(False) - a.get_yaxis().set_visible(False) - a.grid(False) - list_axes[0].imshow(np.array(orig_img)) - list_axes[1].imshow(np.array(optimized_img)) - list_axes[0].set_title(orig_title, fontsize=15) - list_axes[1].set_title(control_title, fontsize=15) - - fig.subplots_adjust(wspace=0.01, hspace=0.01) - fig.tight_layout() - return fig - -.. code:: ipython3 - - %%skip not $to_quantize.value - - fig = visualize_results(image, int8_image) - - - -.. image:: qrcode-monster-with-output_files/qrcode-monster-with-output_39_0.png - - -Compare model file sizes -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - fp16_ir_model_size = unet_ir_path.with_suffix(".bin").stat().st_size / 2**20 - quantized_model_size = UNET_INT8_OV_PATH.with_suffix(".bin").stat().st_size / 2**20 - - print(f"FP16 UNet size: {fp16_ir_model_size:.2f} MB") - print(f"INT8 UNet size: {quantized_model_size:.2f} MB") - print(f"UNet compression rate: {fp16_ir_model_size / quantized_model_size:.3f}") - - -.. parsed-literal:: - - FP16 UNet size: 1639.41 MB - INT8 UNet size: 821.79 MB - UNet compression rate: 1.995 - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - fp16_ir_model_size = controlnet_ir_path.with_suffix(".bin").stat().st_size / 2**20 - quantized_model_size = CONTROLNET_INT8_OV_PATH.with_suffix(".bin").stat().st_size / 2**20 - - print(f"FP16 ControlNet size: {fp16_ir_model_size:.2f} MB") - print(f"INT8 ControlNet size: {quantized_model_size:.2f} MB") - print(f"ControlNet compression rate: {fp16_ir_model_size / quantized_model_size:.3f}") - - -.. parsed-literal:: - - FP16 ControlNet size: 689.09 MB - INT8 ControlNet size: 345.55 MB - ControlNet compression rate: 1.994 - - -Compare inference time of the FP16 and INT8 pipelines -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To measure the inference performance of the ``FP16`` and ``INT8`` -pipelines, we use mean inference time on 3 samples. - - **NOTE**: For the most accurate performance estimation, it is - recommended to run ``benchmark_app`` in a terminal/command prompt - after closing other applications. - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - import time - - def calculate_inference_time(pipeline): - inference_time = [] - pipeline.set_progress_bar_config(disable=True) - for i in range(3): - prompt, qrcode_image = text_prompts[i], qrcode_images[i] - start = time.perf_counter() - _ = pipeline(prompt, qrcode_image, num_inference_steps=25) - end = time.perf_counter() - delta = end - start - inference_time.append(delta) - pipeline.set_progress_bar_config(disable=False) - return np.mean(inference_time) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - fp_latency = calculate_inference_time(ov_pipe) - print(f"FP16 pipeline: {fp_latency:.3f} seconds") - int8_latency = calculate_inference_time(int8_pipe) - print(f"INT8 pipeline: {int8_latency:.3f} seconds") - print(f"Performance speed up: {fp_latency / int8_latency:.3f}") - - -.. parsed-literal:: - - /home/ea/work/py311/lib/python3.11/site-packages/diffusers/configuration_utils.py:140: FutureWarning: Accessing config attribute `controlnet` directly via 'OVContrlNetStableDiffusionPipeline' object attribute is deprecated. Please access 'controlnet' over 'OVContrlNetStableDiffusionPipeline's config object instead, e.g. 'scheduler.config.controlnet'. - deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) - /home/ea/work/py311/lib/python3.11/site-packages/diffusers/configuration_utils.py:140: FutureWarning: Accessing config attribute `unet` directly via 'OVContrlNetStableDiffusionPipeline' object attribute is deprecated. Please access 'unet' over 'OVContrlNetStableDiffusionPipeline's config object instead, e.g. 'scheduler.config.unet'. - deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) - - -.. parsed-literal:: - - FP16 pipeline: 176.250 seconds - INT8 pipeline: 119.885 seconds - Performance speed up: 1.470 - - -Running Text-to-Image Generation with ControlNet Conditioning and OpenVINO --------------------------------------------------------------------------- - - - -Now, we are ready to start generation. For improving the generation -process, we also introduce an opportunity to provide a -``negative prompt``. Technically, positive prompt steers the diffusion -toward the images associated with it, while negative prompt steers the -diffusion away from it. More explanation of how it works can be found in -this -`article `__. -We can keep this field empty if we want to generate image without -negative prompting. - -Please select below whether you would like to use the quantized model to -launch the interactive demo. - -.. code:: ipython3 - - import ipywidgets as widgets - - quantized_model_present = int8_pipe is not None - - use_quantized_model = widgets.Checkbox( - value=True if quantized_model_present else False, - description="Use quantized model", - disabled=not quantized_model_present, - ) - - use_quantized_model - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Use quantized model') - - - -.. 
code:: ipython3 - - import gradio as gr - - pipeline = int8_pipe if use_quantized_model.value else ov_pipe - - - def _generate( - qr_code_content: str, - prompt: str, - negative_prompt: str, - seed: Optional[int] = 42, - guidance_scale: float = 10.0, - controlnet_conditioning_scale: float = 2.0, - num_inference_steps: int = 5, - progress=gr.Progress(track_tqdm=True), - ): - if seed is not None: - np.random.seed(int(seed)) - qrcode_image = create_code(qr_code_content) - return pipeline( - prompt, - qrcode_image, - negative_prompt=negative_prompt, - num_inference_steps=int(num_inference_steps), - guidance_scale=guidance_scale, - controlnet_conditioning_scale=controlnet_conditioning_scale, - )[0] - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/qrcode-monster/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=_generate) - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(share=True, debug=False) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ diff --git a/docs/notebooks/qrcode-monster-with-output_files/qrcode-monster-with-output_22_1.jpg b/docs/notebooks/qrcode-monster-with-output_files/qrcode-monster-with-output_22_1.jpg deleted file mode 100644 index fb87b228ee8a7e..00000000000000 --- a/docs/notebooks/qrcode-monster-with-output_files/qrcode-monster-with-output_22_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:078d2fa182e69d86b8d06299ad1a0684445dd349ba17fedeb358b98b487d3ab5 -size 94722 diff --git a/docs/notebooks/qrcode-monster-with-output_files/qrcode-monster-with-output_22_1.png b/docs/notebooks/qrcode-monster-with-output_files/qrcode-monster-with-output_22_1.png deleted file mode 100644 index dd654ebfbd7d79..00000000000000 --- a/docs/notebooks/qrcode-monster-with-output_files/qrcode-monster-with-output_22_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:04f5e78cc379dd4325281dafd5d2104dce2aa125bc45fa7e431fa3aa501bba17 -size 951488 diff --git a/docs/notebooks/qrcode-monster-with-output_files/qrcode-monster-with-output_39_0.png b/docs/notebooks/qrcode-monster-with-output_files/qrcode-monster-with-output_39_0.png deleted file mode 100644 index c1626c2116146e..00000000000000 --- a/docs/notebooks/qrcode-monster-with-output_files/qrcode-monster-with-output_39_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f8ad96553676658c6f5d0ac52d1060b4f620350d8675d413bd5710a4c609c05d -size 2677155 diff --git a/docs/notebooks/qwen2-audio-with-output.rst b/docs/notebooks/qwen2-audio-with-output.rst deleted file mode 100644 index 5f24bb8ec57257..00000000000000 --- a/docs/notebooks/qwen2-audio-with-output.rst +++ /dev/null @@ -1,475 +0,0 @@ -Audio-language assistant with Qwen2Audio and OpenVINO -===================================================== - -Qwen2-Audio is the new series of Qwen large audio-language models. -Qwen2-Audio is capable of accepting various audio signal inputs and -performing audio analysis or direct textual responses with regard to -speech instructions. 
The model supports more than 8 languages and dialects, -e.g., Chinese, English, Cantonese, French, Italian, Spanish, German, and -Japanese, and can work in two distinct audio interaction modes: \* -**voice chat**: users can freely engage in voice interactions with -Qwen2-Audio without text input; \* -**audio analysis**: users can -provide audio and text instructions for analysis during the interaction. - -.. raw:: html - -

- -.. raw:: html - -

- -More details about model can be found in `model -card `__, -`blog `__, `original -repository `__ and `technical -report `__. - -In this tutorial we consider how to convert and optimize Qwen2Audio -model for creating multimodal chatbot. Additionally, we demonstrate how -to apply stateful transformation on LLM part and model optimization -techniques like weights compression using -`NNCF `__ - -**Table of contents:** - -- `Prerequisites <#prerequisites>`__ -- `Convert and Optimize model <#convert-and-optimize-model>`__ - - - `Compress model weights to - 4-bit <#compress-model-weights-to-4-bit>`__ - -- `Prepare model inference - pipeline <#prepare-model-inference-pipeline>`__ -- `Run model inference <#run-model-inference>`__ - - - `Voice chat <#voice-chat>`__ - - `Audio analysis <#audio-analysis>`__ - -- `Interactive Demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "transformers>=4.45" "torch>=2.1" "librosa" "gradio>=4.36" "modelscope-studio>=0.4.2" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -qU "openvino>=2024.4.0" "nncf>=2.13.0" - -.. code:: ipython3 - - from pathlib import Path - import requests - - if not Path("ov_qwen2_audio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/qwen2-audio/ov_qwen2_audio_helper.py") - open("ov_qwen2_audio_helper.py", "w").write(r.text) - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("qwen2-audio.ipynb") - -Convert and Optimize model --------------------------- - - - -Qwen2Audio is PyTorch model. OpenVINO supports PyTorch models via -conversion to OpenVINO Intermediate Representation (IR). `OpenVINO model -conversion -API `__ -should be used for these purposes. ``ov.convert_model`` function accepts -original PyTorch model instance and example input for tracing and -returns ``ov.Model`` representing this model in OpenVINO framework. -Converted model can be used for saving on disk using ``ov.save_model`` -function or directly loading on device using ``core.compile_model``. -``ov_qwen2_audio_helper.py`` script contains helper function for model -conversion, please check its content if you interested in conversion -details. - -.. raw:: html - -

- -.. raw:: html - - - -Click here for more detailed explanation of conversion steps - -.. raw:: html - - - -Qwen2Audio is autoregressive transformer generative model, it means that -each next model step depends from model output from previous step. The -generation approach is based on the assumption that the probability -distribution of a word sequence can be decomposed into the product of -conditional next word distributions. In other words, model predicts the -next token in the loop guided by previously generated tokens until the -stop-condition will be not reached (generated sequence of maximum length -or end of string token obtained). The way the next token will be -selected over predicted probabilities is driven by the selected decoding -methodology. You can find more information about the most popular -decoding methods in this blog. The entry point for the generation -process for models from the Hugging Face Transformers library is the -``generate`` method. You can find more information about its parameters -and configuration in the documentation. To preserve flexibility in the -selection decoding methodology, we will convert only model inference for -one step. - -The inference flow has difference on first step and for the next. On the -first step, model accept audio and optionally input text instruction -and, that transformed to the unified embedding space using -``input_embedding`` and ``audio_encoder`` models, after that -``language model``, LLM-based part of model, runs on input embeddings to -predict probability of next generated tokens. On the next step, -``language_model`` accepts only next token id selected based on sampling -strategy and processed by ``input_embedding`` model and cached attention -key and values. Since the output side is auto-regressive, an output -token hidden state remains the same once computed for every further -generation step. Therefore, recomputing it every time you want to -generate a new token seems wasteful. With the cache, the model saves the -hidden state once it has been computed. The model only computes the one -for the most recently generated output token at each time step, re-using -the saved ones for hidden tokens. This reduces the generation complexity -from :math:`O(n^3)` to :math:`O(n^2)` for a transformer model. More -details about how it works can be found in this -`article `__. -To sum up above, model consists of 4 parts: - -- **Audio encoder** for encoding input audio into audio embedding space - and **Multi-modal projector** for transforming audio embeddings into - language model embedding space. -- **Input Embedding** for conversion input text tokens into embedding - space -- **Language Model** for generation answer based on input embeddings - provided by Audio Encoder and Input Embedding models. - -.. raw:: html - -
- -Compress model weights to 4-bit -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -For reducing memory -consumption, weights compression optimization can be applied using -`NNCF `__. - -.. raw:: html - -
- -.. raw:: html - - - -Click here for more details about weight compression - -.. raw:: html - - - -Weight compression aims to reduce the memory footprint of a model. It -can also lead to significant performance improvement for large -memory-bound models, such as Large Language Models (LLMs). LLMs and -other models, which require extensive memory to store the weights during -inference, can benefit from weight compression in the following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method primarily -designed to optimize LLMs. The main difference between weights -compression and full model quantization (post-training quantization) is -that activations remain floating-point in the case of weights -compression which leads to a better accuracy. Weight compression for -LLMs provides a solid inference performance improvement which is on par -with the performance of the full model quantization. In addition, weight -compression is data-free and does not require a calibration dataset, -making it easy to use. - -``nncf.compress_weights`` function can be used for performing weights -compression. The function accepts an OpenVINO model and other -compression parameters. Compared to INT8 compression, INT4 compression -improves performance even more, but introduces a minor drop in -prediction quality. - -More details about weights compression, can be found in `OpenVINO -documentation `__. - -.. raw:: html - -
- -.. code:: ipython3 - - pt_model_id = "Qwen/Qwen2-Audio-7B-Instruct" - - model_dir = Path(pt_model_id.split("/")[-1]) - -.. code:: ipython3 - - from ov_qwen2_audio_helper import convert_qwen2audio_model - - # uncomment these lines to see model conversion code - # convert_qwen2audio_model?? - -.. code:: ipython3 - - import nncf - - compression_configuration = { - "mode": nncf.CompressWeightsMode.INT4_ASYM, - "group_size": 128, - "ratio": 1.0, - } - - convert_qwen2audio_model(pt_model_id, model_dir, compression_configuration) - - -.. parsed-literal:: - - ✅ Qwen/Qwen2-Audio-7B-Instruct model already converted. You can find results in Qwen2-Audio-7B-Instruct - - -Prepare model inference pipeline --------------------------------- - - - -As discussed, the model comprises Image Encoder and LLM (with separated -text embedding part) that generates answer. In -``ov_qwen2_audio_helper.py`` we defined inference class -``OVQwen2AudioForConditionalGeneration`` that will represent generation -cycle, It is based on `HuggingFace Transformers -GenerationMixin `__ -and looks similar to `Optimum -Intel `__ -``OVModelForCausalLM`` that is used for LLM inference. - -.. code:: ipython3 - - from ov_qwen2_audio_helper import OVQwen2AudioForConditionalGeneration - - # Uncomment below lines to see the model inference class code - # OVQwen2AudioForConditionalGeneration?? - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(default="AUTO", exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - ov_model = OVQwen2AudioForConditionalGeneration(model_dir, device.value) - -Run model inference -------------------- - - - -.. code:: ipython3 - - from transformers import AutoProcessor, TextStreamer - import librosa - import IPython.display as ipd - - - processor = AutoProcessor.from_pretrained(model_dir) - - audio_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac" - audio_chat_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav" - audio_file = Path(audio_url.split("/")[-1]) - audio_chat_file = Path(audio_chat_url.split("/")[-1]) - - if not audio_file.exists(): - r = requests.get(audio_url) - with audio_file.open("wb") as f: - f.write(r.content) - - if not audio_chat_file.exists(): - r = requests.get(audio_chat_url) - with audio_chat_file.open("wb") as f: - f.write(r.content) - -Voice chat -~~~~~~~~~~ - - - -.. code:: ipython3 - - conversation = [ - {"role": "system", "content": "You are a helpful assistant."}, - { - "role": "user", - "content": [ - {"type": "audio", "audio_url": audio_chat_url}, - ], - }, - ] - - text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) - audios = [librosa.load(audio_chat_file, sr=processor.feature_extractor.sampling_rate)[0]] - - inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True) - display(ipd.Audio(audio_chat_file)) - print("Answer:") - - generate_ids = ov_model.generate(**inputs, max_new_tokens=50, streamer=TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)) - - -.. parsed-literal:: - - It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug. - - - -.. raw:: html - - - - - - -.. 
parsed-literal:: - - Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation. - - -.. parsed-literal:: - - Answer: - Yes, I can guess that you are a female in your twenties. - - -Audio analysis -~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - question = "What does the person say?" - - conversation = [ - {"role": "system", "content": "You are a helpful assistant."}, - { - "role": "user", - "content": [ - {"type": "audio", "audio_url": audio_url}, - {"type": "text", "text": question}, - ], - }, - ] - - text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) - audios = [librosa.load(audio_file, sr=processor.feature_extractor.sampling_rate)[0]] - - inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True) - print("Question:") - print(question) - display(ipd.Audio(audio_file)) - print("Answer:") - - generate_ids = ov_model.generate(**inputs, max_new_tokens=50, streamer=TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)) - - -.. parsed-literal:: - - It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug. - - -.. parsed-literal:: - - Question: - What does the person say? - - - -.. raw:: html - - - - - - -.. parsed-literal:: - - Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation. - - -.. parsed-literal:: - - Answer: - The person says: 'Mister Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.' - - -Interactive Demo ----------------- - - - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/qwen2-vl/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - -.. code:: ipython3 - - from gradio_helper import make_demo - - - demo = make_demo(ov_model, processor) - - try: - demo.launch(debug=True) - except Exception: - demo.launch(debug=True, share=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/qwen2-vl-with-output.rst b/docs/notebooks/qwen2-vl-with-output.rst deleted file mode 100644 index c1dc23b7574c15..00000000000000 --- a/docs/notebooks/qwen2-vl-with-output.rst +++ /dev/null @@ -1,509 +0,0 @@ -Visual-language assistant with Qwen2VL and OpenVINO -=================================================== - -Qwen2VL is the latest addition to the QwenVL series of multimodal large -language models. - -**Key Enhancements of Qwen2VL:** \* **SoTA understanding of images of -various resolution & ratio**: Qwen2-VL achieves state-of-the-art -performance on visual understanding benchmarks, including MathVista, -DocVQA, RealWorldQA, MTVQA, etc. \* **Understanding videos of 20min+**: -Qwen2-VL can understand videos over 20 minutes for high-quality -video-based question answering, dialog, content creation, etc. \* -**Agent that can operate your mobiles, robots, etc.:** with the -abilities of complex reasoning and decision making, Qwen2-VL can be -integrated with devices like mobile phones, robots, etc., for automatic -operation based on visual environment and text instructions. 
\* -**Multilingual Support:** to serve global users, besides English and -Chinese, Qwen2-VL now supports the understanding of texts in different -languages inside images, including most European languages, Japanese, -Korean, Arabic, Vietnamese, etc. - -**Model Architecture Details:** - -- **Naive Dynamic Resolution**: Qwen2-VL can handle arbitrary image - resolutions, mapping them into a dynamic number of visual tokens, - offering a more human-like visual processing experience. - -.. raw:: html - -

- -.. raw:: html - -

- -- **Multimodal Rotary Position Embedding (M-ROPE)**: Decomposes - positional embedding into parts to capture 1D textual, 2D visual, and - 3D video positional information, enhancing its multimodal processing - capabilities. - -.. raw:: html - -

- -.. raw:: html - -

- -More details about model can be found in `model -card `__, -`blog `__ and original -`repo `__. - -In this tutorial we consider how to convert and optimize Qwen2VL model -for creating multimodal chatbot using `Optimum -Intel `__. Additionally, -we demonstrate how to apply model optimization techniques like weights -compression using `NNCF `__ - -**Table of contents:** - -- `Prerequisites <#prerequisites>`__ -- `Select model <#select-model>`__ -- `Convert and Optimize model <#convert-and-optimize-model>`__ - - - `Compress model weights to - 4-bit <#compress-model-weights-to-4-bit>`__ - -- `Prepare model inference - pipeline <#prepare-model-inference-pipeline>`__ - - - `Select inference device <#select-inference-device>`__ - -- `Run model inference <#run-model-inference>`__ -- `Interactive Demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "transformers>=4.45" "torch>=2.1" "torchvision" "qwen-vl-utils" "Pillow" "gradio>=4.36" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q -U "openvino>=2024.6.0" "openvino-tokenizers>=2024.6.0" "nncf>=2.14.0" - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu - -.. code:: ipython3 - - from pathlib import Path - import requests - - if not Path("cmd_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py") - open("cmd_helper.py", "w").write(r.text) - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("qwen2-vl.ipynb") - -Select model ------------- - - - -There are multiple Qwen2VL models available in `models -collection `__. -You can select one of them for conversion and optimization in notebook -using widget bellow: - -.. code:: ipython3 - - import ipywidgets as widgets - - model_ids = ["Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2-VL-7B-Instruct"] - - model_id = widgets.Dropdown( - options=model_ids, - default=model_ids[0], - description="Model:", - ) - - model_id - - - - -.. parsed-literal:: - - Dropdown(description='Model:', options=('Qwen/Qwen2-VL-2B-Instruct', 'Qwen/Qwen2-VL-7B-Instruct'), value='Qwen… - - - -.. code:: ipython3 - - print(f"Selected {model_id.value}") - pt_model_id = model_id.value - model_dir = Path(pt_model_id.split("/")[-1]) - - -.. parsed-literal:: - - Selected Qwen/Qwen2-VL-2B-Instruct - - -Convert and Optimize model --------------------------- - - - -Qwen2VL is PyTorch model. OpenVINO supports PyTorch models via -conversion to OpenVINO Intermediate Representation (IR). `OpenVINO model -conversion -API `__ -should be used for these purposes. ``ov.convert_model`` function accepts -original PyTorch model instance and example input for tracing and -returns ``ov.Model`` representing this model in OpenVINO framework. 
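-
-As a rough, generic illustration of this conversion call only (the model
-object and input shape below are placeholders, not the Qwen2-VL pipeline;
-this notebook itself delegates conversion to Optimum Intel):
-
-.. code:: python
-
-   import openvino as ov
-   import torch
-
-   # `pytorch_model` stands for any torch.nn.Module instance;
-   # the example input is used only to trace the model.
-   example_input = torch.zeros(1, 3, 224, 224)
-   ov_model = ov.convert_model(pytorch_model, example_input=example_input)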
-Converted model can be used for saving on disk using ``ov.save_model`` -function or directly loading on device using ``core.compile_model``. - -For convenience, we will use OpenVINO integration with HuggingFace -Optimum. `Optimum -Intel `__ is the -interface between the Transformers and Diffusers libraries and the -different tools and libraries provided by Intel to accelerate end-to-end -pipelines on Intel architectures. - -Among other use cases, Optimum Intel provides a simple interface to -optimize your Transformers and Diffusers models, convert them to the -OpenVINO Intermediate Representation (IR) format and run inference using -OpenVINO Runtime. ``optimum-cli`` provides command line interface for -model conversion and optimization. - -General command format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where task is task to export the model for, if not specified, the task -will be auto-inferred based on the model. You can find a mapping between -tasks and model classes in Optimum TaskManager -`documentation `__. -Additionally, you can specify weights compression using -``--weight-format`` argument with one of following options: ``fp32``, -``fp16``, ``int8`` and ``int4``. For int8 and int4 -`nncf `__ will be used for -weight compression. More details about model export provided in `Optimum -Intel -documentation `__. - -Compress model weights to 4-bit -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -For reducing memory -consumption, weights compression optimization can be applied using -`NNCF `__. - -.. raw:: html - -

- -.. raw:: html - - - -Click here for more details about weight compression - -.. raw:: html - - - -Weight compression aims to reduce the memory footprint of a model. It -can also lead to significant performance improvement for large -memory-bound models, such as Large Language Models (LLMs). LLMs and -other models, which require extensive memory to store the weights during -inference, can benefit from weight compression in the following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method primarily -designed to optimize LLMs. The main difference between weights -compression and full model quantization (post-training quantization) is -that activations remain floating-point in the case of weights -compression which leads to a better accuracy. Weight compression for -LLMs provides a solid inference performance improvement which is on par -with the performance of the full model quantization. In addition, weight -compression is data-free and does not require a calibration dataset, -making it easy to use. - -``nncf.compress_weights`` function can be used for performing weights -compression. The function accepts an OpenVINO model and other -compression parameters. Compared to INT8 compression, INT4 compression -improves performance even more, but introduces a minor drop in -prediction quality. - -More details about weights compression, can be found in `OpenVINO -documentation `__. - -.. raw:: html - -
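-
-For reference, a comparable 4-bit compression can also be requested from
-Python through Optimum Intel instead of the command line. The sketch below is
-a hedged illustration of that alternative path; the parameter values are
-assumptions, and this notebook itself uses ``optimum-cli`` in the next cell:
-
-.. code:: python
-
-   from optimum.intel.openvino import OVModelForVisualCausalLM
-   from optimum.intel import OVWeightQuantizationConfig
-
-   # Weight-only 4-bit configuration, analogous to `--weight-format int4`.
-   quantization_config = OVWeightQuantizationConfig(bits=4, group_size=128, ratio=1.0)
-
-   model = OVModelForVisualCausalLM.from_pretrained(
-       "Qwen/Qwen2-VL-2B-Instruct",
-       export=True,
-       quantization_config=quantization_config,
-   )
-   model.save_pretrained("Qwen2-VL-2B-Instruct/INT4")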
- -.. code:: ipython3 - - from cmd_helper import optimum_cli - - if not (model_dir / "INT4").exists(): - optimum_cli(pt_model_id, model_dir / "INT4", additional_args={"weight-format": "int4"}) - - - -**Export command:** - - - -``optimum-cli export openvino --model Qwen/Qwen2-VL-2B-Instruct Qwen2-VL-2B-Instruct/INT4 --weight-format int4`` - - -.. parsed-literal:: - - 2024-12-24 18:27:51.174286: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-12-24 18:27:51.186686: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - WARNING: All log messages before absl::InitializeLog() is called are written to STDERR - E0000 00:00:1735050471.201093 340500 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - E0000 00:00:1735050471.205249 340500 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2024-12-24 18:27:51.219846: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - Downloading shards: 100%|██████████| 2/2 [00:00<00:00, 2.73it/s] - `Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46 - Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00, 1.46s/it] - `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. - /home/ea/work/py311/lib/python3.11/site-packages/transformers/cache_utils.py:458: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. - or len(self.key_cache[layer_idx]) == 0 # the layer has no cache - /home/ea/work/py311/lib/python3.11/site-packages/transformers/modeling_attn_mask_utils.py:281: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - elif sliding_window is None or key_value_length < sliding_window: - /home/ea/work/py311/lib/python3.11/site-packages/transformers/models/qwen2_vl/modeling_qwen2_vl.py:1329: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if attention_mask.shape[-1] > target_length: - /home/ea/work/py311/lib/python3.11/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. 
Passing a tensor of different shape might lead to errors or silently give incorrect results. - elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors - - -.. parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ int8_asym │ 15% (1 / 197) │ 0% (0 / 196) │ - ├───────────────────────────┼─────────────────────────────┼────────────────────────────────────────┤ - │ int4_asym │ 85% (196 / 197) │ 100% (196 / 196) │ - ┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:00:45 • 0:00:00;0;104;181m0:00:01181m0:00:02 - [?25hINFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ int8_sym │ 100% (1 / 1) │ 100% (1 / 1) │ - ┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:00:00 • 0:00:00 - [?25hINFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ int8_sym │ 100% (1 / 1) │ 100% (1 / 1) │ - ┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:00:01 • 0:00:00 - [?25hINFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ int8_sym │ 100% (130 / 130) │ 100% (130 / 130) │ - ┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:00:03 • 0:00:0002 • 0:00:01 - [?25h - -Prepare model inference pipeline --------------------------------- - - - -OpenVINO integration with Optimum Intel provides ready-to-use API for -model inference that can be used for smooth integration with -transformers-based solutions. For loading model, we will use -``OVModelForVisualCausalLM`` class that have compatible interface with -Transformers LLaVA implementation. For loading a model, -``from_pretrained`` method should be used. It accepts path to the model -directory or model_id from HuggingFace hub (if model is not converted to -OpenVINO format, conversion will be triggered automatically). 
-Additionally, we can provide an inference device, quantization config -(if model has not been quantized yet) and device-specific OpenVINO -Runtime configuration. More details about model inference with Optimum -Intel can be found in -`documentation `__. - -.. code:: ipython3 - - from optimum.intel.openvino import OVModelForVisualCausalLM - - -.. parsed-literal:: - - 2024-12-24 18:30:03.136274: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-12-24 18:30:03.148865: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - WARNING: All log messages before absl::InitializeLog() is called are written to STDERR - E0000 00:00:1735050603.163311 340474 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - E0000 00:00:1735050603.167677 340474 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2024-12-24 18:30:03.182551: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(default="AUTO", exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - model = OVModelForVisualCausalLM.from_pretrained(model_dir / "INT4", device.value) - - -.. parsed-literal:: - - Could not infer whether the model was already converted or not to the OpenVINO IR, keeping `export=AUTO`. - unsupported operand type(s) for ^: 'bool' and 'str' - - -Run model inference -------------------- - - - -.. code:: ipython3 - - from PIL import Image - from transformers import AutoProcessor, AutoTokenizer - from qwen_vl_utils import process_vision_info - from transformers import TextStreamer - - - min_pixels = 256 * 28 * 28 - max_pixels = 1280 * 28 * 28 - processor = AutoProcessor.from_pretrained(model_dir / "INT4", min_pixels=min_pixels, max_pixels=max_pixels) - - if processor.chat_template is None: - tok = AutoTokenizer.from_pretrained(model_dir) - processor.chat_template = tok.chat_template - - example_image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" - example_image_path = Path("demo.jpeg") - - if not example_image_path.exists(): - Image.open(requests.get(example_image_url, stream=True).raw).save(example_image_path) - - image = Image.open(example_image_path) - question = "Describe this image." 
- - messages = [ - { - "role": "user", - "content": [ - { - "type": "image", - "image": f"file://{example_image_path}", - }, - {"type": "text", "text": question}, - ], - } - ] - - # Preparation for inference - text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs = process_vision_info(messages) - inputs = processor( - text=[text], - images=image_inputs, - videos=video_inputs, - padding=True, - return_tensors="pt", - ) - - display(image) - print("Question:") - print(question) - print("Answer:") - - generated_ids = model.generate(**inputs, max_new_tokens=100, streamer=TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)) - - - -.. image:: qwen2-vl-with-output_files/qwen2-vl-with-output_15_0.png - - -.. parsed-literal:: - - Question: - Describe this image. - Answer: - The image depicts a woman sitting on a sandy beach with a large dog. The dog is wearing a harness and is sitting on its hind legs, reaching up to give a high-five to the woman. The woman is smiling and appears to be enjoying the moment. The background shows the ocean with gentle waves, and the sky is clear with a soft light, suggesting it might be either sunrise or sunset. - - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/qwen2-vl/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - -Interactive Demo ----------------- - - - -Now, you can try to chat with model. Upload image or video using -``Upload`` button, provide your text message into ``Input`` field and -click ``Submit`` to start communication. - -.. code:: ipython3 - - from gradio_helper import make_demo - - - demo = make_demo(model, processor) - - try: - demo.launch(debug=True) - except Exception: - demo.launch(debug=True, share=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/qwen2-vl-with-output_files/qwen2-vl-with-output_15_0.jpg b/docs/notebooks/qwen2-vl-with-output_files/qwen2-vl-with-output_15_0.jpg deleted file mode 100644 index 0788fe5c9e538f..00000000000000 --- a/docs/notebooks/qwen2-vl-with-output_files/qwen2-vl-with-output_15_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d572a157b0327bc3008c2b36b36ec3192212815b2a1c2f7c5f5831089f3fa27f -size 237266 diff --git a/docs/notebooks/qwen2-vl-with-output_files/qwen2-vl-with-output_15_0.png b/docs/notebooks/qwen2-vl-with-output_files/qwen2-vl-with-output_15_0.png deleted file mode 100644 index edf56b758dabe4..00000000000000 --- a/docs/notebooks/qwen2-vl-with-output_files/qwen2-vl-with-output_15_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:00662531da0f0951d986abd8bd4a7a147ae8a1db5ef5f3af737b542008fc02bb -size 2164552 diff --git a/docs/notebooks/riffusion-text-to-music-with-output.rst b/docs/notebooks/riffusion-text-to-music-with-output.rst deleted file mode 100644 index 8805ebe18698b3..00000000000000 --- a/docs/notebooks/riffusion-text-to-music-with-output.rst +++ /dev/null @@ -1,557 +0,0 @@ -Text-to-Music generation using Riffusion and OpenVINO -===================================================== - -`Riffusion `__ is a -latent text-to-image diffusion model capable of generating spectrogram -images given any 
text input. These spectrograms can be converted into -audio clips. General diffusion models are machine learning systems that -are trained to denoise random Gaussian noise step by step, to get to a -sample of interest, such as an image. Diffusion models have been shown -to achieve state-of-the-art results for generating image data. But one -downside of diffusion models is that the reverse denoising process is -slow. In addition, these models consume a lot of memory because they -operate in pixel space, which becomes unreasonably expensive when -generating high-resolution images. Therefore, it is challenging to train -these models and also use them for inference. OpenVINO brings -capabilities to run model inference on Intel hardware and opens the door -to the fantastic world of diffusion models for everyone! - -In this tutorial, we consider how to run a text-to-music generation -pipeline using Riffusion and OpenVINO. We will use a pre-trained model -from the `Diffusers `__ -library. To simplify the user experience, the `Hugging Face Optimum -Intel `__ library is -used to convert the models to OpenVINO™ IR format. - -The tutorial consists of the following steps: - -- Install prerequisites -- Download and convert the model from a public source using the - `OpenVINO integration with Hugging Face - Optimum `__. -- Create a text-to-music inference pipeline -- Run inference pipeline - -About Riffusion ---------------- - -Riffusion is based on Stable Diffusion v1.5 and fine-tuned on images of -spectrogram paired with text. Audio processing happens downstream of the -model. This model can generate an audio spectrogram for given input -text. - -An audio spectrogram is a visual way to represent the frequency content -of a sound clip. The x-axis represents time, and the y-axis represents -frequency. The color of each pixel gives the amplitude of the audio at -the frequency and time given by its row and column. An audio -`spectrogram `__ is a visual -way to represent the frequency content of a sound clip. The x-axis -represents time, and the y-axis represents frequency. The color of each -pixel gives the amplitude of the audio at the frequency and time given -by its row and column. - -.. figure:: https://www.riffusion.com/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Fspectrogram_label.8c8aea56.png&w=1920&q=75 - :alt: spectrogram - - spectrogram - -`\*image source `__ - -The spectrogram can be computed from audio using the `Short-time Fourier -transform -(STFT) `__, -which approximates the audio as a combination of sine waves of varying -amplitudes and phases. - -.. figure:: https://www.riffusion.com/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ffourier_transform.9d53f3d7.png&w=640&q=75 - :alt: stft.png - - stft.png - -`\*image source `__ - -The STFT is invertible, so the original audio can be reconstructed from -a spectrogram. This idea is a behind approach to using Riffusion for -audio generation. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Stable Diffusion pipeline in Optimum - Intel <#stable-diffusion-pipeline-in-optimum-intel>`__ - - - `Select inference device <#select-inference-device>`__ - -- `Prepare postprocessing for reconstruction audio from spectrogram - image <#prepare-postprocessing-for-reconstruction-audio-from-spectrogram-image>`__ -- `Run Inference pipeline <#run-inference-pipeline>`__ -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. 
- -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu Pillow scipy "torch>=2.1" torchaudio "diffusers>=0.16.1" "transformers>=4.33.0" - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" "gradio>=3.34.0" "openvino>=2023.1.0" - -Stable Diffusion pipeline in Optimum Intel ------------------------------------------- - - - -As the riffusion model architecture is the same as Stable Diffusion, we -can use it with the Stable Diffusion pipeline for text-to-image -generation. `Optimum -Intel `__ can be -used to load optimized models from the Hugging Face Hub and create -pipelines to run an inference with OpenVINO Runtime without rewriting -APIs. When Stable Diffusion models are exported to the OpenVINO format, -they are decomposed into three components that consist of four models -combined during inference into the pipeline: - -- The text encoder -- The U-NET -- The VAE encoder -- The VAE decoder - -More details about the Stable Diffusion pipeline can be found in -`stable-diffusion `__ -notebook. - -For running the Stable Diffusion model with Optimum Intel, we should use -the ``optimum.intel.OVStableDiffusionPipeline`` class, which represents -the inference pipeline. ``OVStableDiffusionPipeline`` initialized by the -``from_pretrained`` method. It supports on-the-fly conversion models -from PyTorch using the ``export=True`` parameter. A converted model can -be saved on disk using the ``save_pretrained`` method for the next -running. - -.. code:: ipython3 - - from pathlib import Path - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("riffusion-text-to-music.ipynb") - - MODEL_ID = "riffusion/riffusion-model-v1" - MODEL_DIR = Path("riffusion_pipeline") - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - from optimum.intel.openvino import OVStableDiffusionPipeline - - DEVICE = device.value - - if not MODEL_DIR.exists(): - pipe = OVStableDiffusionPipeline.from_pretrained(MODEL_ID, export=True, device=DEVICE, compile=False) - pipe.half() - pipe.save_pretrained(MODEL_DIR) - else: - pipe = OVStableDiffusionPipeline.from_pretrained(MODEL_DIR, device=DEVICE, compile=False) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -.. parsed-literal:: - - No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda' - 2023-09-19 18:21:08.176653: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2023-09-19 18:21:08.217600: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2023-09-19 18:21:08.865600: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - /home/ea/work/ov_venv/lib/python3.8/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations - warnings.warn( - - -Prepare postprocessing for reconstruction audio from spectrogram image ----------------------------------------------------------------------- - - - -The riffusion model generates an audio spectrogram image, which can be -used to reconstruct audio. However, the spectrogram images from the -model only contain the amplitude of the sine waves and not the phases, -because the phases are chaotic and hard to learn. Instead, we can use -the `Griffin-Lim `__ -algorithm to approximate the phase when reconstructing the audio clip. -The Griffin-Lim Algorithm (GLA) is a phase reconstruction method based -on the redundancy of the `Short-time Fourier transform -(STFT) `__. -It promotes the consistency of a spectrogram by iterating two -projections, where a spectrogram is said to be consistent when its -inter-bin dependency owing to the redundancy of STFT is retained. GLA is -based only on consistency and does not take any prior knowledge about -the target signal into account. - -The frequency bins in generated spectrogram use the `Mel -scale `__, which is a -perceptual scale of pitches judged by listeners to be equal in distance -from one another. - -The code below defines the process of reconstruction of a WAV audio clip -from a spectrogram image using Griffin-Lim Algorithm. - -.. code:: ipython3 - - import io - from typing import Tuple - - import numpy as np - from PIL import Image - from scipy.io import wavfile - import torch - import torchaudio - - - def wav_bytes_from_spectrogram_image(image: Image.Image) -> Tuple[io.BytesIO, float]: - """ - Reconstruct a WAV audio clip from a spectrogram image. Also returns the duration in seconds. 
- - Parameters: - image (Image.Image): generated spectrogram image - Returns: - wav_bytes (io.BytesIO): audio signal encoded in wav bytes - duration_s (float): duration in seconds - """ - - max_volume = 50 - power_for_image = 0.25 - Sxx = spectrogram_from_image(image, max_volume=max_volume, power_for_image=power_for_image) - - sample_rate = 44100 # [Hz] - clip_duration_ms = 5000 # [ms] - - bins_per_image = 512 - n_mels = 512 - - # FFT parameters - window_duration_ms = 100 # [ms] - padded_duration_ms = 400 # [ms] - step_size_ms = 10 # [ms] - - # Derived parameters - num_samples = int(image.width / float(bins_per_image) * clip_duration_ms) * sample_rate - n_fft = int(padded_duration_ms / 1000.0 * sample_rate) - hop_length = int(step_size_ms / 1000.0 * sample_rate) - win_length = int(window_duration_ms / 1000.0 * sample_rate) - - samples = waveform_from_spectrogram( - Sxx=Sxx, - n_fft=n_fft, - hop_length=hop_length, - win_length=win_length, - num_samples=num_samples, - sample_rate=sample_rate, - mel_scale=True, - n_mels=n_mels, - num_griffin_lim_iters=32, - ) - - wav_bytes = io.BytesIO() - wavfile.write(wav_bytes, sample_rate, samples.astype(np.int16)) - wav_bytes.seek(0) - - duration_s = float(len(samples)) / sample_rate - - return wav_bytes, duration_s - - - def spectrogram_from_image(image: Image.Image, max_volume: float = 50, power_for_image: float = 0.25) -> np.ndarray: - """ - Compute a spectrogram magnitude array from a spectrogram image. - - Parameters: - image (image.Image): input image - max_volume (float, *optional*, 50): max volume for spectrogram magnitude - power_for_image (float, *optional*, 0.25): power for reversing power curve - """ - # Convert to a numpy array of floats - data = np.array(image).astype(np.float32) - - # Flip Y take a single channel - data = data[::-1, :, 0] - - # Invert - data = 255 - data - - # Rescale to max volume - data = data * max_volume / 255 - - # Reverse the power curve - data = np.power(data, 1 / power_for_image) - - return data - - - def waveform_from_spectrogram( - Sxx: np.ndarray, - n_fft: int, - hop_length: int, - win_length: int, - num_samples: int, - sample_rate: int, - mel_scale: bool = True, - n_mels: int = 512, - num_griffin_lim_iters: int = 32, - device: str = "cpu", - ) -> np.ndarray: - """ - Reconstruct a waveform from a spectrogram. - This is an approximate waveform, using the Griffin-Lim algorithm - to approximate the phase. - """ - Sxx_torch = torch.from_numpy(Sxx).to(device) - - if mel_scale: - mel_inv_scaler = torchaudio.transforms.InverseMelScale( - n_mels=n_mels, - sample_rate=sample_rate, - f_min=0, - f_max=10000, - n_stft=n_fft // 2 + 1, - norm=None, - mel_scale="htk", - ).to(device) - - Sxx_torch = mel_inv_scaler(Sxx_torch) - - griffin_lim = torchaudio.transforms.GriffinLim( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, - power=1.0, - n_iter=num_griffin_lim_iters, - ).to(device) - - waveform = griffin_lim(Sxx_torch).cpu().numpy() - - return waveform - -Run Inference pipeline ----------------------- - - - -The diagram below briefly describes the workflow of our pipeline - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/3de12c6b-23ef-4953-aeda-5785108990b9 - :alt: riffusion_pipeline.png - - riffusion_pipeline.png - -As you can see, it is very similar to Stable Diffusion Text-to-Image -generation with an additional post-processing step that transforms -generated spectrogram into an audio signal. 
Firstly, -``OVStableDiffusionPipeline`` accepts input text prompt, which will be -tokenized and transformed to embeddings space using Frozen CLIP text -encoder and generates initial latent spectrogram representation using a -random generator, then U-Net iteratively *denoises* the random latent -spectrogram image representations while being conditioned on the text -embeddings. The output of the U-Net, being the noise residual, is used -to compute a denoised latent image representation via a scheduler -algorithm. The *denoising* process is repeated a given number of times -(by default 50) to step-by-step retrieve better latent image -representations. When complete, the latent image representation is -decoded by the decoder part of the variational auto-encoder. Generated -spectrogram image will be converted into a spectrogram magnitude range -and inverse mel scale applied to it to estimate an STFT in the normal -frequency domain from the mel frequency domain. Finally, Griffin-Lim -Algorithm approximates the phase of an audio signal and we got -reconstructed audio. - -.. code:: ipython3 - - pipe.reshape(batch_size=1, height=512, width=512, num_images_per_prompt=1) - pipe.compile() - - - def generate(prompt: str, negative_prompt: str = "") -> Tuple[Image.Image, str]: - """ - function for generation audio from text prompt - - Parameters: - prompt (str): input prompt for generation. - negative_prompt (str): negative prompt for generation, contains undesired concepts for generation, which should be avoided. Can be empty. - Returns: - spec (Image.Image) - generated spectrogram image - """ - spec = pipe(prompt, negative_prompt=negative_prompt, num_inference_steps=20).images[0] - wav = wav_bytes_from_spectrogram_image(spec) - with open("output.wav", "wb") as f: - f.write(wav[0].getbuffer()) - return spec, "output.wav" - - -.. parsed-literal:: - - Compiling the vae_decoder... - Compiling the unet... - Compiling the vae_encoder... - Compiling the text_encoder... - - -Now, we can test our generation. Function generate accepts text input -and returns generated spectrogram and path to generated audio. -Optionally, it also accepts negative prompt. A negative prompt declares -undesired concepts for generation, e.g. if we want to generate -instrumental music, having vocal on audio will be an unwanted effect, so -in this case vocal can be treated as a negative prompt. The positive and -negative prompts are in equal footing. You can always use one with or -without the other. More explanation of how it works can be found in this -`article `__. - -.. code:: ipython3 - - spectrogram, wav_path = generate("Techno beat") - - -.. parsed-literal:: - - `height` was set to 256 but the static model will output images of height 512.To fix the height, please reshape your model accordingly using the `.reshape()` method. - `width` was set to 256 but the static model will output images of width 512.To fix the width, please reshape your model accordingly using the `.reshape()` method. - /home/ea/work/ov_venv/lib/python3.8/site-packages/optimum/intel/openvino/modeling_diffusion.py:559: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly. - outputs = self.request(inputs, shared_memory=True) - - - -.. parsed-literal:: - - 0%| | 0/21 [00:00 - - Your browser does not support the audio element. - - - - - -Interactive demo ----------------- - - - -.. 
code:: ipython3 - - import gradio as gr - - - def select_device(device_str: str, current_text: str = "", progress: gr.Progress = gr.Progress()): - """ - Helper function for uploading model on the device. - - Parameters: - device_str (str): Device name. - current_text (str): Current content of user instruction field (used only for backup purposes, temporally replacing it on the progress bar during model loading). - progress (gr.Progress): gradio progress tracker - Returns: - current_text - """ - if device_str != pipe._device: - pipe.clear_requests() - pipe.to(device_str) - - for i in progress.tqdm(range(1), desc=f"Model loading on {device_str}"): - pipe.compile() - return current_text - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/riffusion-text-to-music/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(generate_fn=generate, select_device_fn=select_device) - - try: - demo.queue().launch(debug=False, height=800) - except Exception: - demo.queue().launch(debug=False, share=True, height=800) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ diff --git a/docs/notebooks/riffusion-text-to-music-with-output_files/riffusion-text-to-music-with-output_14_0.jpg b/docs/notebooks/riffusion-text-to-music-with-output_files/riffusion-text-to-music-with-output_14_0.jpg deleted file mode 100644 index 4a09c8bc4f4b96..00000000000000 --- a/docs/notebooks/riffusion-text-to-music-with-output_files/riffusion-text-to-music-with-output_14_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:94a31b8b6658c858732a376cb99705c80ef2b851ae60b17582cca830b9095954 -size 61095 diff --git a/docs/notebooks/riffusion-text-to-music-with-output_files/riffusion-text-to-music-with-output_14_0.png b/docs/notebooks/riffusion-text-to-music-with-output_files/riffusion-text-to-music-with-output_14_0.png deleted file mode 100644 index f73070696526be..00000000000000 --- a/docs/notebooks/riffusion-text-to-music-with-output_files/riffusion-text-to-music-with-output_14_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:97f983a7505aaca44b18d477608d8859f85e4fbc41f80216986b41e8f8115a55 -size 524399 diff --git a/docs/notebooks/rmbg-background-removal-with-output.rst b/docs/notebooks/rmbg-background-removal-with-output.rst deleted file mode 100644 index 63bf24d7dc08bd..00000000000000 --- a/docs/notebooks/rmbg-background-removal-with-output.rst +++ /dev/null @@ -1,318 +0,0 @@ -Background removal with RMBG v1.4 and OpenVINO -============================================== - -Background matting is the process of accurately estimating the -foreground object in images and videos. It is a very important technique -in image and video editing applications, particularly in film production -for creating visual effects. In case of image segmentation, we segment -the image into foreground and background by labeling the pixels. Image -segmentation generates a binary image, in which a pixel either belongs -to foreground or background. 
However, Image Matting is different from -the image segmentation, wherein some pixels may belong to foreground as -well as background, such pixels are called partial or mixed pixels. In -order to fully separate the foreground from the background in an image, -accurate estimation of the alpha values for partial or mixed pixels is -necessary. - -RMBG v1.4 is background removal model, designed to effectively separate -foreground from background in a range of categories and image types. -This model has been trained on a carefully selected dataset, which -includes: general stock images, e-commerce, gaming, and advertising -content, making it suitable for commercial use cases powering enterprise -content creation at scale. The accuracy, efficiency, and versatility -currently rival leading source-available models. - -More details about model can be found in `model -card `__. - -In this tutorial we consider how to convert and run this model using -OpenVINO. - -**Table of contents:** - -- `Prerequisites <#prerequisites>`__ -- `Load PyTorch model <#load-pytorch-model>`__ -- `Run PyTorch model inference <#run-pytorch-model-inference>`__ -- `Convert Model to OpenVINO Intermediate Representation - format <#convert-model-to-openvino-intermediate-representation-format>`__ -- `Run OpenVINO model inference <#run-openvino-model-inference>`__ -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -install required dependencies - -.. code:: ipython3 - - %pip install -q torch torchvision pillow huggingface_hub "openvino>=2024.0.0" "matplotlib>=3.4" "gradio>=4.15" "transformers>=4.39.1" tqdm --extra-index-url https://download.pytorch.org/whl/cpu - -Download model code from HuggingFace hub - -.. code:: ipython3 - - from huggingface_hub import hf_hub_download - from pathlib import Path - import requests - - repo_id = "briaai/RMBG-1.4" - - download_files = ["utilities.py", "example_input.jpg"] - - for file_for_downloading in download_files: - if not Path(file_for_downloading).exists(): - hf_hub_download(repo_id=repo_id, filename=file_for_downloading, local_dir=".") - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("rmbg-background-removal.ipynb") - -Load PyTorch model ------------------- - - - -For loading model using PyTorch, we should use -``AutoModelForImageSegmentation.from_pretrained`` method. Model weights -will be downloaded automatically during first model usage. Please, note, -it may take some time. - -.. code:: ipython3 - - from transformers import AutoModelForImageSegmentation - - net = AutoModelForImageSegmentation.from_pretrained("briaai/RMBG-1.4", trust_remote_code=True) - -Run PyTorch model inference ---------------------------- - - - -``preprocess_image`` function is responsible for preparing input data in -model-specific format. ``postprocess_image`` function is responsible for -postprocessing model output. 
After postprocessing, generated background -mask can be inserted into original image as alpha-channel. - -.. code:: ipython3 - - import torch - from PIL import Image - from utilities import preprocess_image, postprocess_image - import numpy as np - from matplotlib import pyplot as plt - - - def visualize_result(orig_img: Image, mask: Image, result_img: Image): - """ - Helper for results visualization - - parameters: - orig_img (Image): input image - mask (Image): background mask - result_img (Image) output image - returns: - plt.Figure: plot with 3 images for visualization - """ - titles = ["Original", "Background Mask", "Without background"] - im_w, im_h = orig_img.size - is_horizontal = im_h <= im_w - figsize = (20, 20) - num_images = 3 - fig, axs = plt.subplots( - num_images if is_horizontal else 1, - 1 if is_horizontal else num_images, - figsize=figsize, - sharex="all", - sharey="all", - ) - fig.patch.set_facecolor("white") - list_axes = list(axs.flat) - for a in list_axes: - a.set_xticklabels([]) - a.set_yticklabels([]) - a.get_xaxis().set_visible(False) - a.get_yaxis().set_visible(False) - a.grid(False) - list_axes[0].imshow(np.array(orig_img)) - list_axes[1].imshow(np.array(mask), cmap="gray") - list_axes[0].set_title(titles[0], fontsize=15) - list_axes[1].set_title(titles[1], fontsize=15) - list_axes[2].imshow(np.array(result_img)) - list_axes[2].set_title(titles[2], fontsize=15) - - fig.subplots_adjust(wspace=0.01 if is_horizontal else 0.00, hspace=0.01 if is_horizontal else 0.1) - fig.tight_layout() - return fig - - - im_path = "./example_input.jpg" - - # prepare input - model_input_size = [1024, 1024] - orig_im = np.array(Image.open(im_path)) - orig_im_size = orig_im.shape[0:2] - image = preprocess_image(orig_im, model_input_size) - - # inference - result = net(image) - - # post process - result_image = postprocess_image(result[0][0], orig_im_size) - - # save result - pil_im = Image.fromarray(result_image) - no_bg_image = Image.new("RGBA", pil_im.size, (0, 0, 0, 0)) - orig_image = Image.open(im_path) - no_bg_image.paste(orig_image, mask=pil_im) - no_bg_image.save("example_image_no_bg.png") - - visualize_result(orig_image, pil_im, no_bg_image); - - - -.. image:: rmbg-background-removal-with-output_files/rmbg-background-removal-with-output_8_0.png - - -Convert Model to OpenVINO Intermediate Representation format ------------------------------------------------------------- - - - -OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate -Representation (IR). `OpenVINO model conversion -API `__ -should be used for these purposes. ``ov.convert_model`` function accepts -original PyTorch model instance and example input for tracing and -returns ``ov.Model`` representing this model in OpenVINO framework. -Converted model can be used for saving on disk using ``ov.save_model`` -function or directly loading on device using ``core.complie_model``. - -.. code:: ipython3 - - import openvino as ov - - ov_model_path = Path("rmbg-1.4.xml") - - if not ov_model_path.exists(): - ov_model = ov.convert_model(net, example_input=image, input=[1, 3, *model_input_size]) - ov.save_model(ov_model, ov_model_path) - -Run OpenVINO model inference ----------------------------- - - - -After finishing conversion, we can compile converted model and run it -using OpenVINO on specified device. For selection inference device, -please use dropdown list below: - -.. code:: ipython3 - - from notebook_utils import device_widget - - core = ov.Core() - - device = device_widget() - - device - - - - -.. 
parsed-literal:: - - Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO') - - - -Let’s run model on the same image that we used before for launching -PyTorch model. OpenVINO model input and output is fully compatible with -original pre- and postprocessing steps, it means that we can reuse them. - -.. code:: ipython3 - - ov_compiled_model = core.compile_model(ov_model_path, device.value) - - result = ov_compiled_model(image)[0] - - # post process - result_image = postprocess_image(torch.from_numpy(result), orig_im_size) - - # save result - pil_im = Image.fromarray(result_image) - no_bg_image = Image.new("RGBA", pil_im.size, (0, 0, 0, 0)) - orig_image = Image.open(im_path) - no_bg_image.paste(orig_image, mask=pil_im) - no_bg_image.save("example_image_no_bg.png") - - visualize_result(orig_image, pil_im, no_bg_image); - - - -.. image:: rmbg-background-removal-with-output_files/rmbg-background-removal-with-output_14_0.png - - -Interactive demo ----------------- - - - -.. code:: ipython3 - - def get_background_mask(model, image): - return model(image)[0] - - - def on_submit(image): - original_image = image.copy() - - h, w = image.shape[:2] - image = preprocess_image(original_image, model_input_size) - - mask = get_background_mask(ov_compiled_model, image) - result_image = postprocess_image(torch.from_numpy(mask), (h, w)) - pil_im = Image.fromarray(result_image) - orig_img = Image.fromarray(original_image) - no_bg_image = Image.new("RGBA", pil_im.size, (0, 0, 0, 0)) - no_bg_image.paste(orig_img, mask=pil_im) - - return no_bg_image - -.. code:: ipython3 - - import requests - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/rmbg-background-removal/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=on_submit) - - try: - demo.launch(debug=True) - except Exception: - demo.launch(share=True, debug=True) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ diff --git a/docs/notebooks/rmbg-background-removal-with-output_files/rmbg-background-removal-with-output_14_0.png b/docs/notebooks/rmbg-background-removal-with-output_files/rmbg-background-removal-with-output_14_0.png deleted file mode 100644 index fc66982690d557..00000000000000 --- a/docs/notebooks/rmbg-background-removal-with-output_files/rmbg-background-removal-with-output_14_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7ffceb125f9d03bf8ac52c1c2546ed5a5dc1412eec250f95973fd7022fb1a9b2 -size 1082425 diff --git a/docs/notebooks/rmbg-background-removal-with-output_files/rmbg-background-removal-with-output_8_0.png b/docs/notebooks/rmbg-background-removal-with-output_files/rmbg-background-removal-with-output_8_0.png deleted file mode 100644 index 56eefe85542fac..00000000000000 --- a/docs/notebooks/rmbg-background-removal-with-output_files/rmbg-background-removal-with-output_8_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1141c35a792e2921e23ed8fec4ddcf5535ce6c75c4d43f076f93b1632033f612 -size 1082450 diff --git a/docs/notebooks/s3d-mil-nce-text-to-video-retrieval-with-output.rst b/docs/notebooks/s3d-mil-nce-text-to-video-retrieval-with-output.rst deleted 
file mode 100644 index dd112a3bdaaced..00000000000000 --- a/docs/notebooks/s3d-mil-nce-text-to-video-retrieval-with-output.rst +++ /dev/null @@ -1,523 +0,0 @@ -Text-to-Video retrieval with S3D MIL-NCE and OpenVINO -===================================================== - -This tutorial based on `the TensorFlow -tutorial `__ -that demonstrates how to use the `S3D -MIL-NCE `__ model from -TensorFlow Hub to do text-to-video retrieval to find the most similar -videos for a given text query. - -MIL-NCE inherits from Multiple Instance Learning (MIL) and Noise -Contrastive Estimation (NCE). The method is capable of addressing -visually misaligned narrations from uncurated instructional videos. Two -model variations are available with different 3D CNN backbones: I3D and -S3D. In this tutorial we use S3D variation. More details about the -training and the model can be found in `End-to-End Learning of Visual -Representations from Uncurated Instructional -Videos `__ paper. - -This tutorial demonstrates step-by-step instructions on how to run and -optimize S3D MIL-NCE model with OpenVINO. An additional part -demonstrates how to run quantization with -`NNCF `__ to speed up the -inference. - -The tutorial consists of the following steps: - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `The original inference <#the-original-inference>`__ -- `Convert the model to OpenVINO - IR <#convert-the-model-to-openvino-ir>`__ -- `Compiling models <#compiling-models>`__ -- `Inference <#inference>`__ -- `Optimize model using NNCF Post-training Quantization - API <#optimize-model-using-nncf-post-training-quantization-api>`__ - - - `Prepare dataset <#prepare-dataset>`__ - - `Perform model quantization <#perform-model-quantization>`__ - -- `Run quantized model inference <#run-quantized-model-inference>`__ - - - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install --upgrade --pre openvino-tokenizers "openvino>=2024.2.0" --extra-index-url "https://storage.openvinotoolkit.org/simple/wheels/nightly" - %pip install -q "tensorflow-macos>=2.5; sys_platform == 'darwin' and platform_machine == 'arm64' and python_version > '3.8'" # macOS M1 and M2 - %pip install -q "tensorflow>=2.5; sys_platform == 'darwin' and platform_machine != 'arm64' and python_version > '3.8'" # macOS x86 - %pip install -q "tensorflow>=2.5; sys_platform != 'darwin' and python_version > '3.8'" - - %pip install -q --no-deps tensorflow_hub - %pip install -q tf_keras numpy "opencv-python" "nncf>=2.10.0" - %pip install -q "matplotlib>=3.4" - -.. 
code:: ipython3 - - import os - import requests - from pathlib import Path - - import tensorflow as tf - import tensorflow_hub as hub - - import numpy as np - import cv2 - from IPython import display - import math - - os.environ["TFHUB_CACHE_DIR"] = str(Path("./tfhub_modules").resolve()) - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("s3d-mil-nce-text-to-video-retrieval.ipynb") - -Download the model - -.. code:: ipython3 - - hub_handle = "https://www.kaggle.com/models/deepmind/mil-nce/TensorFlow1/s3d/1" - hub_model = hub.load(hub_handle) - -The model has 2 signatures, one for generating video embeddings and one -for generating text embeddings. We will use these embedding to find the -nearest neighbors in the embedding space as in the original tutorial. -Below we will define auxiliary functions - -.. code:: ipython3 - - def generate_embeddings(model, input_frames, input_words): - """Generate embeddings from the model from video frames and input words.""" - # Input_frames must be normalized in [0, 1] and of the shape Batch x T x H x W x 3 - vision_output = model.signatures["video"](tf.constant(tf.cast(input_frames, dtype=tf.float32))) - text_output = model.signatures["text"](tf.constant(input_words)) - - return vision_output["video_embedding"], text_output["text_embedding"] - -.. code:: ipython3 - - # @title Define video loading and visualization functions { display-mode: "form" } - - - # Utilities to open video files using CV2 - def crop_center_square(frame): - y, x = frame.shape[0:2] - min_dim = min(y, x) - start_x = (x // 2) - (min_dim // 2) - start_y = (y // 2) - (min_dim // 2) - return frame[start_y : start_y + min_dim, start_x : start_x + min_dim] - - - def load_video(video_url, max_frames=32, resize=(224, 224)): - if video_url.startswith("http"): - path = tf.keras.utils.get_file(os.path.basename(video_url)[-128:], video_url, cache_dir=".", cache_subdir="data") - else: - path = video_url - cap = cv2.VideoCapture(path) - frames = [] - try: - while True: - ret, frame = cap.read() - if not ret: - break - frame = crop_center_square(frame) - frame = cv2.resize(frame, resize) - frame = frame[:, :, [2, 1, 0]] - frames.append(frame) - - if len(frames) == max_frames: - break - finally: - cap.release() - frames = np.array(frames) - if len(frames) < max_frames: - n_repeat = int(math.ceil(max_frames / float(len(frames)))) - frames = frames.repeat(n_repeat, axis=0) - frames = frames[:max_frames] - return frames / 255.0 - - - def display_video(urls): - html = "" - html += "" - for url in urls: - html += "" - html += "
<td>"
-             html += '<img src="{}" height="224">'.format(url)
-             html += "</td>"
-         html += "</tr></table>"
-         return display.HTML(html)
-
-
-     def display_query_and_results_video(query, urls, scores):
-         """Display a text query and the top result videos and scores."""
-         sorted_ix = np.argsort(-scores)
-         html = ""
-         html += "<h2>Input query: <i>{}</i></h2><div>".format(query)
-         html += "Results: <div>"
-         html += "<table>"
-         html += "<tr><th>Rank #1, Score:{:.2f}</th>".format(scores[sorted_ix[0]])
-         html += "<th>Rank #2, Score:{:.2f}</th>".format(scores[sorted_ix[1]])
-         html += "<th>Rank #3, Score:{:.2f}</th></tr><tr>".format(scores[sorted_ix[2]])
-         for i, idx in enumerate(sorted_ix):
-             url = urls[sorted_ix[i]]
-             html += "<td>"
-             html += '<img src="{}" height="224">'.format(url)
-             html += "</td>"
-         html += "</tr></table>"
-
-         return html
-
-.. code:: ipython3
-
-     video_1_url = "https://upload.wikimedia.org/wikipedia/commons/b/b0/YosriAirTerjun.gif"
-     video_2_url = "https://upload.wikimedia.org/wikipedia/commons/e/e6/Guitar_solo_gif.gif"
-     video_3_url = "https://upload.wikimedia.org/wikipedia/commons/3/30/2009-08-16-autodrift-by-RalfR-gif-by-wau.gif"
-
-     video_1_path = Path("data/YosriAirTerjun.gif")
-     video_2_path = Path("data/Guitar_solo_gif.gif")
-     video_3_path = Path("data/2009-08-16-autodrift-by-RalfR-gif-by-wau.gif")
-     video_1 = load_video(video_1_url if not video_1_path.exists() else video_1_path)
-     video_2 = load_video(video_2_url if not video_2_path.exists() else video_2_path)
-     video_3 = load_video(video_3_url if not video_3_path.exists() else video_3_path)
-     all_videos = [video_1, video_2, video_3]
-
-     query_1_video = "waterfall"  # @param {type:"string"}
-     query_2_video = "playing guitar"  # @param {type:"string"}
-     query_3_video = "car drifting"  # @param {type:"string"}
-     all_queries_video = [query_1_video, query_2_video, query_3_video]
-     all_videos_urls = [video_1_url, video_2_url, video_3_url]
-     display_video(all_videos_urls)
-
-The cell output renders an HTML table with the three example videos (Video 1, Video 2, Video 3) side by side.
-
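-Before running inference, it can be worth sanity-checking what ``load_video``
-returns: the helper is expected to yield 32 frames of size 224x224 with RGB
-values normalized to the [0, 1] range. The short cell below is an illustrative
-addition (it is not part of the original notebook) that simply verifies these
-assumptions for the downloaded clips.
-
-.. code:: ipython3
-
-     # Illustrative check only: confirm each clip has the expected
-     # T x H x W x 3 layout and the [0, 1] value range assumed by the model.
-     for i, clip in enumerate(all_videos, start=1):
-         print(f"video_{i}: shape={clip.shape}, min={clip.min():.3f}, max={clip.max():.3f}")
-         assert clip.shape == (32, 224, 224, 3)
-         assert clip.min() >= 0.0 and clip.max() <= 1.0
-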
- - - -The original inference
------------------------
-
-
-
-.. code:: ipython3
-
-     # Prepare video inputs.
-     videos_np = np.stack(all_videos, axis=0)
-
-     # Prepare text input.
-     words_np = np.array(all_queries_video)
-
-     # Generate the video and text embeddings.
-     video_embd, text_embd = generate_embeddings(hub_model, videos_np, words_np)
-
-     # Scores between video and text are computed by dot products.
-     all_scores = np.dot(text_embd, tf.transpose(video_embd))
-
-.. code:: ipython3
-
-     # Display results.
-     html = ""
-     for i, words in enumerate(words_np):
-         html += display_query_and_results_video(words, all_videos_urls, all_scores[i, :])
-         html += "<br>"
-     display.HTML(html)
-
-
-.. raw:: html
-
-    Input query: waterfall        Rank #1, Score: 4.71   |   Rank #2, Score: -1.63   |   Rank #3, Score: -4.17
-    Input query: playing guitar   Rank #1, Score: 6.50   |   Rank #2, Score: -1.79   |   Rank #3, Score: -2.67
-    Input query: car drifting     Rank #1, Score: 8.78   |   Rank #2, Score: -1.07   |   Rank #3, Score: -2.17
-
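-The score matrix produced above has one row per text query and one column per
-video, so the best-matching clip for a query is just the column with the
-highest dot product. The cell below is an illustrative addition (not part of
-the original notebook) that prints the top match for every query.
-
-.. code:: ipython3
-
-     # For each text query, pick the video with the highest similarity score.
-     best_match = np.argmax(all_scores, axis=1)
-     for query, idx in zip(all_queries_video, best_match):
-         print(f"'{query}' -> {all_videos_urls[idx]}")
-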
- - - -Convert the model to OpenVINO IR --------------------------------- - -OpenVINO supports TensorFlow -models via conversion into Intermediate Representation (IR) format. We -need to provide a model object, input data for model tracing to -``ov.convert_model`` function to obtain OpenVINO ``ov.Model`` object -instance. Model can be saved on disk for next deployment using -``ov.save_model`` function. - -.. code:: ipython3 - - import openvino_tokenizers # NOQA Need to import conversion and operation extensions - import openvino as ov - - model_path = hub.resolve(hub_handle) - # infer on random data - images_data = np.random.rand(3, 32, 224, 224, 3).astype(np.float32) - words_data = np.array(["First sentence", "Second one", "Abracadabra"], dtype=str) - - ov_model = ov.convert_model(model_path, input=[("words", [3]), ("images", [3, 32, 224, 224, 3])]) - -Compiling models ----------------- - - - -Only CPU is supported for this model due to strings as input. - -.. code:: ipython3 - - core = ov.Core() - - compiled_model = core.compile_model(ov_model, device_name="CPU") - -Inference ---------- - - - -.. code:: ipython3 - - # Redefine `generate_embeddings` function to make it possible to use the compile IR model. - def generate_embeddings(model, input_frames, input_words): - """Generate embeddings from the model from video frames and input words.""" - # Input_frames must be normalized in [0, 1] and of the shape Batch x T x H x W x 3 - output = compiled_model({"words": input_words, "images": tf.cast(input_frames, dtype=tf.float32)}) - - return output["video_embedding"], output["text_embedding"] - -.. code:: ipython3 - - # Generate the video and text embeddings. - video_embd, text_embd = generate_embeddings(compiled_model, videos_np, words_np) - - # Scores between video and text is computed by dot products. - all_scores = np.dot(text_embd, tf.transpose(video_embd)) - -.. code:: ipython3 - - # Display results. - html = "" - for i, words in enumerate(words_np): - html += display_query_and_results_video(words, all_videos_urls, all_scores[i, :]) - html += "
" - display.HTML(html) - - - - -.. raw:: html - -

-    Input query: waterfall        Rank #1, Score: 4.71   |   Rank #2, Score: -1.63   |   Rank #3, Score: -4.17
-    Input query: playing guitar   Rank #1, Score: 6.50   |   Rank #2, Score: -1.79   |   Rank #3, Score: -2.67
-    Input query: car drifting     Rank #1, Score: 8.78   |   Rank #2, Score: -1.07   |   Rank #3, Score: -2.17
-
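-As mentioned above, the converted model can be serialized with ``ov.save_model``
-so that conversion does not need to be repeated on the next run. The cell below
-is an illustrative sketch (not part of the original notebook); the file name is
-arbitrary and ``core.read_model`` restores the IR before compilation.
-
-.. code:: ipython3
-
-     # Hypothetical location for the converted IR; any path works.
-     ir_path = Path("model/s3d_mil_nce.xml")
-     ir_path.parent.mkdir(exist_ok=True)
-
-     if not ir_path.exists():
-         ov.save_model(ov_model, ir_path)
-
-     # On a later run the saved IR can be loaded back instead of re-converting.
-     restored_model = core.read_model(ir_path)
-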
- - - -Optimize model using NNCF Post-training Quantization API --------------------------------------------------------- - - - -`NNCF `__ provides a suite of -advanced algorithms for Neural Networks inference optimization in -OpenVINO with minimal accuracy drop. We will use 8-bit quantization in -post-training mode (without the fine-tuning pipeline). The optimization -process contains the following steps: - -1. Create a Dataset for quantization. -2. Run ``nncf.quantize`` for getting an optimized model. -3. Serialize an OpenVINO IR model, using the ``ov.save_model`` function. - -Prepare dataset -~~~~~~~~~~~~~~~ - - - -This model doesn’t require a big dataset for calibration. We will use -only example videos for this purpose. NNCF provides ``nncf.Dataset`` -wrapper for using native framework dataloaders in quantization pipeline. -Additionally, we specify transform function that will be responsible for -preparing input data in model expected format. - -.. code:: ipython3 - - import nncf - - dataset = nncf.Dataset(((words_np, tf.cast(videos_np, dtype=tf.float32)),)) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, openvino - - -Perform model quantization -~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The ``nncf.quantize`` function provides an interface for model -quantization. It requires an instance of the OpenVINO Model and -quantization dataset. Optionally, some additional parameters for the -configuration quantization process (number of samples for quantization, -preset, ignored scope etc.) can be provided. - -.. code:: ipython3 - - MODEL_DIR = Path("model/") - MODEL_DIR.mkdir(exist_ok=True) - - quantized_model_path = MODEL_DIR / "quantized_model.xml" - - - if not quantized_model_path.exists(): - quantized_model = nncf.quantize(model=ov_model, calibration_dataset=dataset, model_type=nncf.ModelType.TRANSFORMER) - ov.save_model(quantized_model, quantized_model_path) - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - INFO:nncf:39 ignored nodes were found by name in the NNCFGraph - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -Run quantized model inference ------------------------------ - - - -There are no changes in model usage after applying quantization. Let’s -check the model work on the previously used example. - -.. code:: ipython3 - - int8_model = core.compile_model(quantized_model_path, device_name="CPU") - -.. code:: ipython3 - - # Generate the video and text embeddings. - video_embd, text_embd = generate_embeddings(int8_model, videos_np, words_np) - - # Scores between video and text is computed by dot products. - all_scores = np.dot(text_embd, tf.transpose(video_embd)) - -.. code:: ipython3 - - # Display results. - html = "" - for i, words in enumerate(words_np): - html += display_query_and_results_video(words, all_videos_urls, all_scores[i, :]) - html += "
" - display.HTML(html) - - - - -.. raw:: html - -

-    Input query: waterfall        Rank #1, Score: 4.71   |   Rank #2, Score: -1.63   |   Rank #3, Score: -4.17
-    Input query: playing guitar   Rank #1, Score: 6.50   |   Rank #2, Score: -1.79   |   Rank #3, Score: -2.67
-    Input query: car drifting     Rank #1, Score: 8.78   |   Rank #2, Score: -1.07   |   Rank #3, Score: -2.17
-
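-Quantization is mainly aimed at reducing inference latency. The cell below is
-an illustrative addition (not part of the original notebook) that compares the
-median latency of the original and the quantized compiled models on the example
-inputs; absolute numbers depend on the hardware used.
-
-.. code:: ipython3
-
-     import time
-
-     sample_inputs = {"words": words_np, "images": tf.cast(videos_np, dtype=tf.float32)}
-
-
-     def median_latency(model, n_runs=10):
-         latencies = []
-         for _ in range(n_runs):
-             start = time.perf_counter()
-             model(sample_inputs)
-             latencies.append(time.perf_counter() - start)
-         return np.median(latencies)
-
-
-     print(f"FP model median latency:   {median_latency(compiled_model):.3f} s")
-     print(f"INT8 model median latency: {median_latency(int8_model):.3f} s")
-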
- - diff --git a/docs/notebooks/sana-image-generation-with-output.rst b/docs/notebooks/sana-image-generation-with-output.rst deleted file mode 100644 index def06b95bd0410..00000000000000 --- a/docs/notebooks/sana-image-generation-with-output.rst +++ /dev/null @@ -1,352 +0,0 @@ -Image generation with Sana and OpenVINO -======================================= - -Sana is a text-to-image framework that can efficiently generate images -up to 4096 × 4096 resolution developed by NVLabs. Sana can synthesize -high-resolution, high-quality images with strong text-image alignment at -a remarkably fast speed, deployable on laptop GPU. Core designs include: -\* **Deep compression autoencoder**: unlike traditional AEs, which -compress images only 8×, we trained an AE that can compress images 32×, -effectively reducing the number of latent tokens. \* **Linear DiT**: -authors replaced all vanilla attention in DiT with linear attention, -which is more efficient at high resolutions without sacrificing quality. -\* **Decoder-only text encoder**\ *: T5 replaced by modern decoder-only -small LLM as the text encoder and designed complex human instruction -with in-context learning to enhance the image-text alignment.* -**Efficient training and sampling**: Proposed Flow-DPM-Solver to reduce -sampling steps, with efficient caption labeling and selection to -accelerate convergence. - -More details about model can be found in -`paper `__, `model -page `__ and `original -repo `__. In this tutorial, we consider -how to optimize and run Sana model using OpenVINO. - -**Table of contents:** - -- `Prerequisites <#prerequisites>`__ -- `Select model variant <#select-model-variant>`__ -- `Convert and Optimize model with - OpenVINO <#convert-and-optimize-model-with-openvino>`__ - - - `Convert model using Optimum - Intel <#convert-model-using-optimum-intel>`__ - - `Compress model weights <#compress-model-weights>`__ - -- `Run OpenVINO model inference <#run-openvino-model-inference>`__ -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - import platform - - %pip install -q "gradio>=4.19" "torch>=2.1" "transformers" "nncf>=2.14.0" "diffusers>=0.32.0" "opencv-python" "pillow" "peft>=0.7.0" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "sentencepiece" "protobuf" - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -qU "openvino>=2024.6.0" - - if platform.system() == "Darwin": - %pip install "numpy<2.0" - -.. 
code:: ipython3 - - from pathlib import Path - import requests - - helpers = ["notebook_utils.py", "cmd_helper.py"] - base_url = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils" - - for helper in helpers: - if not Path(helper).exists(): - r = requests.get(f"{base_url}/{helper}") - with open(helper, "w") as f: - f.write(r.text) - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/sana-image-generation/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("sana-image-generation.ipynb") - -Select model variant --------------------- - - - -.. code:: ipython3 - - import ipywidgets as widgets - - model_ids = [ - "Efficient-Large-Model/Sana_600M_512px_diffusers", - "Efficient-Large-Model/Sana_600M_1024px_diffusers", - "Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers", - "Efficient-Large-Model/Sana_1600M_2Kpx_BF16_diffusers", - "Efficient-Large-Model/Sana_1600M_4Kpx_BF16_diffusers", - ] - - model_selector = widgets.Dropdown( - options=model_ids, - default=model_ids[0], - description="Model:", - ) - - - model_selector - - - - -.. parsed-literal:: - - Dropdown(description='Model:', options=('Efficient-Large-Model/Sana_600M_512px_diffusers', 'Efficient-Large-Mo… - - - -Convert and Optimize model with OpenVINO ----------------------------------------- - - - -Starting from 2023.0 release, OpenVINO supports PyTorch models directly -via Model Conversion API. ``ov.convert_model`` function accepts instance -of PyTorch model and example inputs for tracing and returns object of -``ov.Model`` class, ready to use or save on disk using ``ov.save_model`` -function. - -The pipeline consists of four important parts: - -- Gemma Text Encoder to create condition to generate an image from a - text prompt. -- Transformer for step-by-step denoising latent image representation. -- Deep Compression Autoencoder (DCAE) for decoding latent space to - image. - -Convert model using Optimum Intel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -For convenience, we will use OpenVINO integration with HuggingFace -Optimum. `Optimum -Intel `__ is the -interface between the Transformers and Diffusers libraries and the -different tools and libraries provided by Intel to accelerate end-to-end -pipelines on Intel architectures. - -Among other use cases, Optimum Intel provides a simple interface to -optimize your Transformers and Diffusers models, convert them to the -OpenVINO Intermediate Representation (IR) format and run inference using -OpenVINO Runtime. ``optimum-cli`` provides command line interface for -model conversion and optimization. - -General command format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where task is task to export the model for, if not specified, the task -will be auto-inferred based on the model (in case of image generation, -**text-to-image** should be selected). You can find a mapping between -tasks and model classes in Optimum TaskManager -`documentation `__. -Additionally, you can specify weights compression using -``--weight-format`` argument with one of following options: ``fp32``, -``fp16``, ``int8`` and ``int4``. For int8 and int4 -`nncf `__ will be used for -weight compression. 
More details about model export provided in `Optimum -Intel -documentation `__. - -.. code:: ipython3 - - from pathlib import Path - - model_id = model_selector.value - variant = "fp16" if "BF16" not in model_id else "bf16" - - model_dir = Path(model_id.split("/")[-1]) - - additional_args = {"variant": variant, "weight-format": "fp16"} - -.. code:: ipython3 - - from cmd_helper import optimum_cli - - if not model_dir.exists(): - optimum_cli(model_id, model_dir, additional_args=additional_args) - -Compress model weights -~~~~~~~~~~~~~~~~~~~~~~ - - - -For reducing model memory consumption we will use weights compression. -The `Weights -Compression `__ -algorithm is aimed at compressing the weights of the models and can be -used to optimize the model footprint and performance of large models -where the size of weights is relatively larger than the size of -activations, for example, Large Language Models (LLM). Compared to INT8 -compression, INT4 compression improves performance even more, but -introduces a minor drop in prediction quality. We will use -`NNCF `__ for transformer -weight compression. - -.. code:: ipython3 - - to_compress = widgets.Checkbox( - value=True, - description="Weight compression", - disabled=False, - ) - - to_compress - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Weight compression') - - - -.. code:: ipython3 - - import openvino as ov - import nncf - import gc - - compressed_transformer = Path(model_dir) / "transformer/openvino_model_i4.xml" - - if to_compress.value and not compressed_transformer.exists(): - core = ov.Core() - - ov_model = core.read_model(model_dir / "transformer/openvino_model.xml") - - compressed_model = nncf.compress_weights(ov_model, mode=nncf.CompressWeightsMode.INT4_SYM, group_size=64, ratio=1.0) - ov.save_model(compressed_model, compressed_transformer) - del compressed_model - del ov_model - - gc.collect(); - - - - - - - - - - - - - - - - - -Run OpenVINO model inference ----------------------------- - - - -``OVDiffusionPipeline`` from Optimum Intel provides ready-to-use -interface for running Diffusers models using OpenVINO. It supports -various models including Stable Diffusion, Stable Diffusion XL, LCM, -Stable Diffusion v3 and Flux. Similar to original Diffusers pipeline, -for initialization, we should use ``from_preptrained`` method providing -model id from HuggingFace hub or local directory (both original PyTorch -and OpenVINO models formats supported, in the first case model class -additionally will trigger model conversion). - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(default="CPU", exclude=["NPU"]) - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -.. code:: ipython3 - - from optimum.intel.openvino import OVDiffusionPipeline - - ov_pipe = OVDiffusionPipeline.from_pretrained(model_dir, device=device.value, transformer_file_name=compressed_transformer.name if to_compress.value else None) - -.. code:: ipython3 - - import torch - - prompt = "Cute 🐶 Wearing 🕶 flying on the 🌈" - - image = ov_pipe( - prompt, - generator=torch.Generator("cpu").manual_seed(1234563), - ).images[0] - - image - - - -.. parsed-literal:: - - 0%| | 0/20 [00:00`__, -trained for real-time synthesis. 
SDXL Turbo is based on a novel -distillation technique called Adversarial Diffusion Distillation (ADD), -which enables the model to synthesize image outputs in a single step and -generate real-time text-to-image outputs while maintaining high sampling -fidelity. More details about this distillation approach can be found in -`technical -report `__. -More details about model can be found in `Stability AI blog -post `__. - -Previously, we already discussed how to launch Stable Diffusion XL model -using OpenVINO in the following `notebook `__, -in this tutorial we will focus on the -`SDXL-turbo `__ version. -Additionally, to improve image decoding speed, we will use `Tiny -Autoencoder `__, which is useful -for real-time previewing of the SDXL generation process. - -We will use a pre-trained model from the `Hugging Face -Diffusers `__ library. To -simplify the user experience, the `Hugging Face Optimum -Intel `__ library is -used to convert the models to OpenVINO™ IR format. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Convert model to OpenVINO - format <#convert-model-to-openvino-format>`__ -- `Text-to-image generation <#text-to-image-generation>`__ - - - `Select inference device for text-to-image - generation <#select-inference-device-for-text-to-image-generation>`__ - -- `Image-to-Image generation <#image-to-image-generation>`__ -- `Quantization <#quantization>`__ - - - `Prepare calibration dataset <#prepare-calibration-dataset>`__ - - `Run quantization <#run-quantization>`__ - - - `Compare UNet file size <#compare-unet-file-size>`__ - - - `Compare inference time of the FP16 and INT8 - models <#compare-inference-time-of-the-fp16-and-int8-models>`__ - -- `Interactive Demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu \ - "torch>=2.1" transformers "diffusers>=0.24.0" "git+https://github.com/huggingface/optimum-intel.git" "gradio>=4.19" "peft>=0.6.2" "openvino>=2023.3.0" - -Convert model to OpenVINO format --------------------------------- - - - -`sdxl-turbo `__ is -available for downloading via the `HuggingFace -hub `__. We will use optimum-cli -interface for exporting it into OpenVINO Intermediate Representation -(IR) format. - -Optimum CLI interface for converting models supports export to OpenVINO -(supported starting optimum-intel 1.12 version). General command format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where task is task to export the model for, if not specified, the task -will be auto-inferred based on the model. Available tasks depend on the -model, for sdxl should be selected ``stable-diffusion-xl`` - -You can find a mapping between tasks and model classes in Optimum -TaskManager -`documentation `__. - -Additionally, you can specify weights compression ``--weight-format`` -for the model compression. Please note, that for INT8/INT4, it is -necessary to install nncf. - -Full list of supported arguments available via ``--help`` For more -details and examples of usage, please check `optimum -documentation `__. 
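-As a concrete illustration, the export used later in this notebook (where the
-model id and output directory are held in variables) is equivalent to the
-following command; the ``model`` output directory name is just an example:
-
-.. code:: ipython3
-
-     # Export SDXL Turbo to OpenVINO IR with FP16 weights.
-     !optimum-cli export openvino --model stabilityai/sdxl-turbo --task stable-diffusion-xl model --weight-format fp16
-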
- -For Tiny Autoencoder, we will use ``ov.convert_model`` function for -obtaining ``ov.Model`` and save it using ``ov.save_model``. Model -consists of 2 parts that used in pipeline separately: ``vae_encoder`` -for encoding input image in latent space in image-to-image generation -task and ``vae_decoder`` that responsible for decoding diffusion result -back to image format. - -.. code:: ipython3 - - from pathlib import Path - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("sdxl-turbo.ipynb") - - model_dir = Path("./model") - sdxl_model_id = "stabilityai/sdxl-turbo" - tae_id = "madebyollin/taesdxl" - skip_convert_model = model_dir.exists() - -.. code:: ipython3 - - import torch - import openvino as ov - from diffusers import AutoencoderTiny - import gc - - - class VAEEncoder(torch.nn.Module): - def __init__(self, vae): - super().__init__() - self.vae = vae - - def forward(self, sample): - return self.vae.encode(sample) - - - class VAEDecoder(torch.nn.Module): - def __init__(self, vae): - super().__init__() - self.vae = vae - - def forward(self, latent_sample): - return self.vae.decode(latent_sample) - - - def convert_tiny_vae(model_id, output_path): - tiny_vae = AutoencoderTiny.from_pretrained(model_id) - tiny_vae.eval() - vae_encoder = VAEEncoder(tiny_vae) - ov_model = ov.convert_model(vae_encoder, example_input=torch.zeros((1, 3, 512, 512))) - ov.save_model(ov_model, output_path / "vae_encoder/openvino_model.xml") - tiny_vae.save_config(output_path / "vae_encoder") - vae_decoder = VAEDecoder(tiny_vae) - ov_model = ov.convert_model(vae_decoder, example_input=torch.zeros((1, 4, 64, 64))) - ov.save_model(ov_model, output_path / "vae_decoder/openvino_model.xml") - tiny_vae.save_config(output_path / "vae_decoder") - - - if not skip_convert_model: - !optimum-cli export openvino --model $sdxl_model_id --task stable-diffusion-xl $model_dir --weight-format fp16 - convert_tiny_vae(tae_id, model_dir) - -Text-to-image generation ------------------------- - - - -Text-to-image generation lets you create images using text description. -To start generating images, we need to load models first. To load an -OpenVINO model and run an inference with Optimum and OpenVINO Runtime, -you need to replace diffusers ``StableDiffusionXLPipeline`` with Optimum -``OVStableDiffusionXLPipeline``. Pipeline initialization starts with -using ``from_pretrained`` method, where a directory with OpenVINO models -should be passed. Additionally, you can specify an inference device. - -Select inference device for text-to-image generation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - -.. code:: ipython3 - - from optimum.intel.openvino import OVStableDiffusionXLPipeline - - text2image_pipe = OVStableDiffusionXLPipeline.from_pretrained(model_dir, device=device.value) - - -.. parsed-literal:: - - Compiling the vae_decoder to AUTO ... - Compiling the unet to AUTO ... - Compiling the text_encoder to AUTO ... - Compiling the text_encoder_2 to AUTO ... - Compiling the vae_encoder to AUTO ... 
- - -The pipeline interface is similar to original -``StableDiffusionXLPipeline``. We should provide text prompt. The -default number of steps is 50, while sdxl-turbo required only 1 step. -According to the information provided in model card, model does not use -negative prompt and guidance scale and this parameters should be -disabled using ``guidance_scale = 0`` - -.. code:: ipython3 - - import numpy as np - - prompt = "cute cat" - image = text2image_pipe( - prompt, - num_inference_steps=1, - height=512, - width=512, - guidance_scale=0.0, - generator=np.random.RandomState(987), - ).images[0] - image.save("cat.png") - image - - - -.. parsed-literal:: - - 0%| | 0/1 [00:00= 1 after applying strength multiplication. e.g. in example -below, we will use ``num_inference_steps=2`` and ``stength=0.5``, -finally, we get 0.5 \* 2.0 = 1 step in our pipeline. - -.. code:: ipython3 - - photo_image = image2image_pipe( - photo_prompt, - image=image, - num_inference_steps=2, - generator=np.random.RandomState(511), - guidance_scale=0.0, - strength=0.5, - ).images[0] - photo_image.save("cat_tie.png") - photo_image - - - -.. parsed-literal:: - - 0%| | 0/1 [00:00`__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -According to ``SDXL-Turbo Model`` structure, the UNet model takes up -significant portion of the overall pipeline execution time. Now we will -show you how to optimize the UNet part using -`NNCF `__ to reduce -computation cost and speed up the pipeline. Quantizing the rest of the -SDXL pipeline does not significantly improve inference performance but -can lead to a substantial degradation of accuracy. - -The optimization process contains the following steps: - -1. Create a calibration dataset for quantization. -2. Run ``nncf.quantize()`` to obtain quantized model. -3. Save the ``INT8`` model using ``openvino.save_model()`` function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - skip_for_device = "GPU" in device.value - to_quantize = quantization_widget(not skip_for_device) - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - import requests - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - int8_pipe = None - - core = ov.Core() - - %load_ext skip_kernel_extension - -Prepare calibration dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use a portion of -`conceptual_captions `__ -dataset from Hugging Face as calibration data. To collect intermediate -model inputs for calibration we should customize ``CompiledModel``. - -.. code:: ipython3 - - UNET_INT8_OV_PATH = model_dir / "optimized_unet" / "openvino_model.xml" - - - def disable_progress_bar(pipeline, disable=True): - if not hasattr(pipeline, "_progress_bar_config"): - pipeline._progress_bar_config = {"disable": disable} - else: - pipeline._progress_bar_config["disable"] = disable - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - import datasets - import numpy as np - from tqdm.notebook import tqdm - from transformers import set_seed - from typing import Any, Dict, List - - set_seed(1) - - class CompiledModelDecorator(ov.CompiledModel): - def __init__(self, compiled_model: ov.CompiledModel, data_cache: List[Any] = None): - super().__init__(compiled_model) - self.data_cache = data_cache if data_cache else [] - - def __call__(self, *args, **kwargs): - self.data_cache.append(*args) - return super().__call__(*args, **kwargs) - - def collect_calibration_data(pipe, subset_size: int) -> List[Dict]: - original_unet = pipe.unet.request - pipe.unet.request = CompiledModelDecorator(original_unet) - - dataset = datasets.load_dataset("google-research-datasets/conceptual_captions", split="train", trust_remote_code=True).shuffle(seed=42) - disable_progress_bar(pipe) - - # Run inference for data collection - pbar = tqdm(total=subset_size) - diff = 0 - for batch in dataset: - prompt = batch["caption"] - if len(prompt) > pipe.tokenizer.model_max_length: - continue - _ = pipe( - prompt, - num_inference_steps=1, - height=512, - width=512, - guidance_scale=0.0, - generator=np.random.RandomState(987) - ) - collected_subset_size = len(pipe.unet.request.data_cache) - if collected_subset_size >= subset_size: - pbar.update(subset_size - pbar.n) - break - pbar.update(collected_subset_size - diff) - diff = collected_subset_size - - calibration_dataset = pipe.unet.request.data_cache - disable_progress_bar(pipe, disable=False) - pipe.unet.request = original_unet - return calibration_dataset - -.. code:: ipython3 - - %%skip not $to_quantize.value - - if not UNET_INT8_OV_PATH.exists(): - text2image_pipe = OVStableDiffusionXLPipeline.from_pretrained(model_dir, device=device.value) - unet_calibration_data = collect_calibration_data(text2image_pipe, subset_size=200) - - -.. parsed-literal:: - - Compiling the vae_decoder to AUTO ... - Compiling the unet to AUTO ... - Compiling the text_encoder_2 to AUTO ... - Compiling the vae_encoder to AUTO ... - Compiling the text_encoder to AUTO ... - - - -.. parsed-literal:: - - 0%| | 0/200 [00:00= validation_size: - break - disable_progress_bar(pipe, disable=False) - return np.median(inference_time) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - int8_latency = calculate_inference_time(int8_text2image_pipe, validation_data) - text2image_pipe = OVStableDiffusionXLPipeline.from_pretrained(model_dir, device=device.value) - fp_latency = calculate_inference_time(text2image_pipe, validation_data) - print(f"FP16 pipeline latency: {fp_latency:.3f}") - print(f"INT8 pipeline latency: {int8_latency:.3f}") - print(f"Text-to-Image generation speed up: {fp_latency / int8_latency:.3f}") - - -.. parsed-literal:: - - Compiling the vae_decoder to AUTO ... - Compiling the unet to AUTO ... - Compiling the vae_encoder to AUTO ... - Compiling the text_encoder to AUTO ... - Compiling the text_encoder_2 to AUTO ... - - -.. parsed-literal:: - - FP16 pipeline latency: 1.775 - INT8 pipeline latency: 0.673 - Text-to-Image generation speed up: 2.636 - - -Interactive Demo ----------------- - - - -Now, you can check model work using own text descriptions. Provide text -prompt in the text box and launch generation using Run button. 
-Additionally you can control generation with additional parameters: \* -Seed - random seed for initialization \* Steps - number of generation -steps \* Height and Width - size of generated image - - Please note that increasing image size may require to increasing - number of steps for accurate result. We recommend running 104x1024 - resolution image generation using 4 steps. - -Please select below whether you would like to use the quantized model to -launch the interactive demo. - -.. code:: ipython3 - - import ipywidgets as widgets - - quantized_model_present = UNET_INT8_OV_PATH.exists() - - use_quantized_model = widgets.Checkbox( - value=True if quantized_model_present else False, - description="Use quantized model", - disabled=False, - ) - - use_quantized_model - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Use quantized model') - - - -.. code:: ipython3 - - text2image_pipe = OVStableDiffusionXLPipeline.from_pretrained(model_dir, device=device.value) - if use_quantized_model.value: - if not quantized_model_present: - raise RuntimeError("Quantized model not found.") - text2image_pipe.unet.model = core.read_model(UNET_INT8_OV_PATH) - text2image_pipe.unet.request = core.compile_model(text2image_pipe.unet.model, device.value) - - - def generate_from_text(text, seed, num_steps, height, width): - result = text2image_pipe( - text, - num_inference_steps=num_steps, - guidance_scale=0.0, - generator=np.random.RandomState(seed), - height=height, - width=width, - ).images[0] - return result - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/sdxl-turbo/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=generate_from_text, quantized=use_quantized_model.value) - - try: - demo.launch(debug=False) - except Exception: - demo.launch(share=True, debug=False) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ diff --git a/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_11_1.jpg b/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_11_1.jpg deleted file mode 100644 index 53dadd35dc3e79..00000000000000 --- a/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_11_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a31da96342d885a0513fabe53b007afbd31d9163627f814b9497c0584ad4833a -size 28123 diff --git a/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_11_1.png b/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_11_1.png deleted file mode 100644 index 972d6c7fa6e062..00000000000000 --- a/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_11_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6434cc1dccde2b97c66b8b70570f71d9deec583d03bab5e04832030114f4f3d3 -size 360072 diff --git a/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_17_1.jpg b/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_17_1.jpg deleted file mode 100644 index 23ee6e6dd5a207..00000000000000 --- a/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_17_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:e21868cc922d1258fa72434537e0d821aa8081ec03bbb8e0cac87442c93d2ffd -size 28382 diff --git a/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_17_1.png b/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_17_1.png deleted file mode 100644 index 72c35b16c02b29..00000000000000 --- a/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_17_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ee60530f05b4d815af6d3641dd5bb6dfc88af1a7cacb314e3a47f0e288b52275 -size 373515 diff --git a/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_29_3.jpg b/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_29_3.jpg deleted file mode 100644 index 70c2a635c04974..00000000000000 --- a/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_29_3.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:885d458bd53a4b3f9d5b75c7cbff15a651866b3fea57470e3d540a7c34c4a7bf -size 27161 diff --git a/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_29_3.png b/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_29_3.png deleted file mode 100644 index c92a18f4e2f3e4..00000000000000 --- a/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_29_3.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c7a1c0c270b58da5666396ef9c414eac2764191272bf7a4cbf494faba031c339 -size 355149 diff --git a/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_30_3.jpg b/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_30_3.jpg deleted file mode 100644 index 52f5e9b4481df0..00000000000000 --- a/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_30_3.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:79c94cbc1e4d40fed43793a60ad1e95fc622fbf6d61fa7673e8ce16888d486d3 -size 26745 diff --git a/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_30_3.png b/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_30_3.png deleted file mode 100644 index 6142d132f024d9..00000000000000 --- a/docs/notebooks/sdxl-turbo-with-output_files/sdxl-turbo-with-output_30_3.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:542d6a4373239d096fb41e7c65409e53b0ad72be0be357d901ea613197b0ceaa -size 365761 diff --git a/docs/notebooks/segment-anything-2-image-with-output.rst b/docs/notebooks/segment-anything-2-image-with-output.rst deleted file mode 100644 index 619a889d57181e..00000000000000 --- a/docs/notebooks/segment-anything-2-image-with-output.rst +++ /dev/null @@ -1,1376 +0,0 @@ -Object masks from prompts with SAM2 and OpenVINO for Images -=========================================================== - -.. warning:: - - Important note: This notebook requires python >= 3.10. Please make - sure that your environment fulfill to this requirement before running - it - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. 
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Image prediction <#image-prediction>`__ -- `Prepare image <#prepare-image>`__ -- `Download and inference PyTorch - model <#download-and-inference-pytorch-model>`__ -- `Convert model to OpenVINO Intermediate - Representation <#convert-model-to-openvino-intermediate-representation>`__ - - - `Image Encoder <#image-encoder>`__ - - `Mask predictor <#mask-predictor>`__ - -- `Run OpenVINO model in interactive segmentation - mode <#run-openvino-model-in-interactive-segmentation-mode>`__ - - - `Example Image <#example-image>`__ - - `Preprocessing and visualization - utilities <#preprocessing-and-visualization-utilities>`__ - - `Image encoding <#image-encoding>`__ - - `Example point input <#example-point-input>`__ - - `Example with multiple points <#example-with-multiple-points>`__ - - `Example box and point input with negative - label <#example-box-and-point-input-with-negative-label>`__ - -- `Interactive segmentation with - Gradio <#interactive-segmentation-with-gradio>`__ -- `Run OpenVINO model in automatic mask generation - mode <#run-openvino-model-in-automatic-mask-generation-mode>`__ -- `Optimize encoder using NNCF Post-training Quantization - API <#optimize-encoder-using-nncf-post-training-quantization-api>`__ - - - `Prepare a calibration dataset <#prepare-a-calibration-dataset>`__ - - `Run quantization and serialize OpenVINO IR - model <#run-quantization-and-serialize-openvino-ir-model>`__ - - `Validate Quantized Model - Inference <#validate-quantized-model-inference>`__ - - `Compare Performance of the Original and Quantized - Models <#compare-performance-of-the-original-and-quantized-models>`__ - -Segmentation - identifying which image pixels belong to an object - is a -core task in computer vision and is used in a broad array of -applications, from analyzing scientific imagery to editing photos. But -creating an accurate segmentation model for specific tasks typically -requires highly specialized work by technical experts with access to AI -training infrastructure and large volumes of carefully annotated -in-domain data. Reducing the need for task-specific modeling expertise, -training compute, and custom data annotation for image segmentation is -the main goal of the Segment Anything project. - -`Segment Anything Model 2 (SAM 2) `__ is -a foundation model towards solving promptable visual segmentation in -images and videos. It extend SAM to video by considering images as a -video with a single frame. When SAM 2 is applied to images the model -behaves like SAM. SAM 2 has all the capabilities of SAM on static -images. The model first converts the image into an image embedding that -allows high quality masks to be efficiently produced from a prompt. A -promptable and light-weight mask decoder accepts a image embedding and -prompts (if any) on the current image and outputs a segmentation mask. -SAM 2 supports point, box, and mask prompts. - -This notebook shows an example of how to convert and use Segment -Anything Model 2 in OpenVINO format, allowing it to run on a variety of -platforms that support an OpenVINO. - -The diagram below demonstrates the SAM 2 architecture. For a given -frame, the segmentation prediction is conditioned on the current prompt -and/or on previously observed memories. Videos are processed in a -streaming fashion with frames being consumed one at a time by the image -encoder, and cross-attended to memories of the target object from -previous frames. 
The mask decoder, which optionally also takes input -prompts, predicts the segmentation mask for that frame. Finally, a -memory encoder transforms the prediction and image encoder embeddings -(not shown in the figure) for use in future frames - -.. figure:: https://raw.githubusercontent.com/facebookresearch/segment-anything-2/main/assets/model_diagram.png - :alt: model_diagram - - model_diagram - -More details about approach can be found in the -`paper `__, -original -`repo `__ and -`Meta AI blog post `__ - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "gradio>=4.13,<4.41.0" "fastapi==0.112.4" "openvino>=2024.4.0" "nncf>=2.13" "torch>=2.3.1" "torchvision>=0.18.1" opencv-python tqdm numpy --extra-index-url https://download.pytorch.org/whl/cpu - %pip install "iopath>=0.1.10" "pillow>=9.4.0" "hydra-core>=1.3.2" - - %pip install -q "matplotlib>=3.4" - -.. code:: ipython3 - - import requests - from pathlib import Path - - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - - from notebook_utils import download_file - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("segment-anything-2-image.ipynb") - -Clone and install segment-anything-2 - -.. code:: ipython3 - - from cmd_helper import clone_repo - - - repo_dir = clone_repo("https://github.com/facebookresearch/sam2.git") - - -.. parsed-literal:: - - env: SAM2_BUILD_CUDA=0 - - -.. code:: ipython3 - - %env SAM2_BUILD_CUDA=0 - %cd sam2 - %pip install -q -e . - %cd .. - -.. code:: ipython3 - - ov_sam2_helper_file_name = "ov_sam2_helper.py" - - if not Path(ov_sam2_helper_file_name).exists(): - r = requests.get( - url=f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/sam2-image-segmentation/{ov_sam2_helper_file_name}", - ) - open(ov_sam2_helper_file_name, "w").write(r.text) - -Image prediction ----------------- - - - -Prepare image -------------- - - - -.. code:: ipython3 - - import numpy as np - import matplotlib.pyplot as plt - from PIL import Image - - image_path = Path("truck.jpg") - if not image_path.exists(): - download_file("https://raw.githubusercontent.com/facebookresearch/segment-anything/main/notebooks/images/truck.jpg") - - image = Image.open(image_path) - image = np.array(image.convert("RGB")) - - -.. parsed-literal:: - - 'truck.jpg' already exists. - - -.. code:: ipython3 - - plt.figure(figsize=(10, 10)) - plt.imshow(image) - plt.axis("on") - plt.show() - - - -.. image:: segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_12_0.png - - -Download and inference PyTorch model ------------------------------------- - - - -There are several `Segment Anything V2 -Models `__ -available for downloading. In this tutorial we will use model based on -``sam2-hiera-large``, but the demonstrated approach is very general and -applicable to other SAM models. - -SAM2 provide image prediction APIs that closely resemble SAM for image -use cases. The ``SAM2ImagePredictor`` class has an easy interface for -image prompting. 
Also the model could be loaded by -``SAM2ImagePredictor`` from -`HuggingFace `__. List -of other SAM2 models could be found -`here `__. - -.. code:: ipython3 - - from sam2.sam2_image_predictor import SAM2ImagePredictor - - predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-large", device="cpu") - -.. code:: ipython3 - - input_point = np.array([[500, 375]]) - input_label = np.array([1]) - -.. code:: ipython3 - - from ov_sam2_helper import show_points - - plt.figure(figsize=(10, 10)) - plt.imshow(image) - show_points(input_point, input_label, plt.gca()) - plt.axis("on") - plt.show() - - - -.. image:: segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_16_0.png - - -.. code:: ipython3 - - predictor.set_image(image) - - masks, scores, logits = predictor.predict(point_coords=input_point, point_labels=input_label, multimask_output=False) - -.. code:: ipython3 - - from ov_sam2_helper import show_masks - - show_masks(image, masks, point_coords=input_point, input_labels=input_label) - - - -.. image:: segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_18_0.png - - -.. code:: ipython3 - - predictor.reset_predictor() - -Convert model to OpenVINO Intermediate Representation ------------------------------------------------------ - - - -We split model on 2 independent parts: image_encoder and mask_predictor, -where mask_predictor is combination of Prompt Encoder and Mask Decoder. - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Image Encoder -~~~~~~~~~~~~~ - - - -The image encoder is only run once for the entire interaction and its -role is to provide unconditioned tokens (feature embeddings) -representing each frame. Then changing prompt, prompt encoder and mask -decoder can be run multiple times to retrieve different objects from the -same image. - -Image Encoder input is tensor with shape ``1x3x1024x1024`` in ``NCHW`` -format, contains image for segmentation. Image Encoder output is image -embeddings, tensor with shape ``1x256x64x64`` and two tensors of high -resolution features with shapes ``1x32x256x256`` and ``1x64x128x128``. - -To learn more about conversion of Image Encoder, please, see -``SamImageEncoderModel`` from ``ov_sam2_helper.py``. - -.. code:: ipython3 - - import warnings - import torch - from ov_sam2_helper import SamImageEncoderModel - - ov_encoder_path = Path("ov_image_encoder.xml") - if not ov_encoder_path.exists(): - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=torch.jit.TracerWarning) - warnings.filterwarnings("ignore", category=UserWarning) - - image_encoder = SamImageEncoderModel(predictor) - ov_encoder_model = ov.convert_model( - image_encoder, - example_input=torch.zeros(1, 3, 1024, 1024), - input=([1, 3, 1024, 1024],), - ) - ov.save_model(ov_encoder_model, ov_encoder_path) - else: - ov_encoder_model = core.read_model(ov_encoder_path) - -.. code:: ipython3 - - ov_encoder = core.compile_model(ov_encoder_model, device.value) - -Mask predictor -~~~~~~~~~~~~~~ - - - -Mask prediction will be includes two models: - -* **Prompt Encoder** - Encoder for segmentation condition. As a condition can be used points, - boxes or segmentation mask. 
-* **Mask Decoder** - The mask decoder - efficiently maps the image embedding, prompt embeddings, and an output - token to a mask. - -Combined prompt encoder and mask decoder model has following list of -inputs: - -- ``image_embeddings``: The image embedding from ``image_encoder``. Has - a batch index of length 1. -- ``high_res_feats_256``: The high resolution features from - ``image_encoder``. Has a batch index of length 1. -- ``high_res_feats_128``: The high resolution features from - ``image_encoder``. Has a batch index of length 1. -- ``point_coords``: Coordinates of sparse input prompts, corresponding - to both point inputs and box inputs. Boxes are encoded using two - points, one for the top-left corner and one for the bottom-right - corner. *Coordinates must already be transformed to long-side 1024.* - Has a batch index of length 1. -- ``point_labels``: Labels for the sparse input prompts. 0 is a - negative input point, 1 is a positive input point, 2 is a top-left - box corner, 3 is a bottom-right box corner, and -1 is a padding - point. \*If there is no box input, a single padding point with label - -1 and coordinates (0.0, 0.0) should be concatenated. - -Model outputs: - -- ``masks`` - predicted masks resized to original image size, to obtain - a binary mask, should be compared with ``threshold`` (usually equal - 0.0). -- ``iou_predictions`` - intersection over union predictions -- ``low_res_masks`` - predicted masks before postprocessing, can be - used as mask input for model. - -Note that we use the ``multimask_output=False`` option when converting. -This way, the SAM2 model returns a single mask that it define as the -best variant. You can also avoid using use_high_res_features by passing -``use_high_res_features=False`` in ``SamImageMaskPredictionModel``. You -can find more details about conversion of Mask Predictor in -``SamImageMaskPredictionModel`` from ``ov_sam2_helper.py``. - -.. code:: ipython3 - - from ov_sam2_helper import SamImageMaskPredictionModel - - - ov_mask_predictor_path = Path("ov_mask_predictor.xml") - if not ov_mask_predictor_path.exists(): - exportable_model = SamImageMaskPredictionModel(predictor.model, multimask_output=False) - embed_dim = predictor.model.sam_prompt_encoder.embed_dim - embed_size = predictor.model.sam_prompt_encoder.image_embedding_size - - hf_sizes = predictor._bb_feat_sizes - - dummy_inputs = { - "image_embeddings": torch.randn(1, embed_dim, *embed_size, dtype=torch.float), - "high_res_feats_256": torch.randn(1, 32, *hf_sizes[0], dtype=torch.float), - "high_res_feats_128": torch.randn(1, 64, *hf_sizes[1], dtype=torch.float), - "point_coords": torch.randint(low=0, high=1024, size=(1, 5, 2), dtype=torch.float), - "point_labels": torch.randint(low=0, high=4, size=(1, 5), dtype=torch.float), - } - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=torch.jit.TracerWarning) - warnings.filterwarnings("ignore", category=UserWarning) - ov_mask_predictor_model = ov.convert_model(exportable_model, example_input=dummy_inputs) - ov.save_model(ov_mask_predictor_model, ov_mask_predictor_path) - else: - ov_mask_predictor_model = core.read_model(ov_mask_predictor_path) - -.. code:: ipython3 - - ov_predictor = core.compile_model(ov_mask_predictor_model, device.value) - -Run OpenVINO model in interactive segmentation mode ---------------------------------------------------- - - - -Example Image -~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - import numpy as np - import cv2 - import matplotlib.pyplot as plt - - download_file("https://raw.githubusercontent.com/facebookresearch/segment-anything/main/notebooks/images/truck.jpg") - image = cv2.imread("truck.jpg") - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - - -.. parsed-literal:: - - 'truck.jpg' already exists. - - -.. code:: ipython3 - - plt.figure(figsize=(10, 10)) - plt.imshow(image) - plt.axis("off") - plt.show() - - - -.. image:: segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_33_0.png - - -Preprocessing and visualization utilities -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To prepare input for Image Encoder we should: - -1. Convert BGR image to RGB -2. Resize image saving aspect ratio where longest size equal to Image - Encoder input size - 1024. -3. Normalize image subtract mean values (123.675, 116.28, 103.53) and - divide by std (58.395, 57.12, 57.375) -4. Transpose HWC data layout to CHW and add batch dimension. -5. Add zero padding to input tensor by height or width (depends on - aspect ratio) according Image Encoder expected input shape. - -These steps are applicable to all available models - -.. code:: ipython3 - - from ov_sam2_helper import ResizeLongestSide, preprocess_image, postprocess_masks - - resizer = ResizeLongestSide(1024) - -Image encoding -~~~~~~~~~~~~~~ - - - -To start work with image, we should preprocess it and obtain image -embeddings using ``ov_encoder``. We will use the same image for all -experiments, so it is possible to generate image embedding once and then -reuse them. - -.. code:: ipython3 - - preprocessed_image = preprocess_image(image, resizer) - encoding_results = ov_encoder(preprocessed_image) - -Now, we can try to provide different prompts for mask generation - -Example point input -~~~~~~~~~~~~~~~~~~~ - - - -In this example we select one point. The green star symbol show its -location on the image below. - -.. code:: ipython3 - - input_point = np.array([[500, 375]]) - input_label = np.array([1]) - - plt.figure(figsize=(10, 10)) - plt.imshow(image) - show_points(input_point, input_label, plt.gca()) - plt.axis("off") - plt.show() - - - -.. image:: segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_39_0.png - - -Add a batch index, concatenate a padding point, and transform it to -input tensor coordinate system. - -.. code:: ipython3 - - coord = np.concatenate([input_point, np.array([[0.0, 0.0]])], axis=0)[None, :, :] - label = np.concatenate([input_label, np.array([-1])], axis=0)[None, :].astype(np.float32) - coord = resizer.apply_coords(coord, image.shape[:2]).astype(np.float32) - -Package the inputs to run in the mask predictor. - -.. code:: ipython3 - - inputs = { - "image_embeddings": torch.from_numpy(encoding_results[ov_encoder.output(0)]), - "high_res_feats_256": torch.from_numpy(encoding_results[ov_encoder.output(1)]), - "high_res_feats_128": torch.from_numpy(encoding_results[ov_encoder.output(2)]), - "point_coords": coord, - "point_labels": label, - } - -Predict a mask and threshold it to get binary mask (0 - no object, 1 - -object). - -.. code:: ipython3 - - results = ov_predictor(inputs) - - masks = results[ov_predictor.output(0)] - masks = postprocess_masks(masks, image.shape[:-1], resizer) - - masks = masks > 0.0 - -.. code:: ipython3 - - mask = masks[0] - mask = np.transpose(mask, (1, 2, 0)) - -.. 
code:: ipython3
-
-    from ov_sam2_helper import show_mask
-
-    plt.figure(figsize=(10, 10))
-    plt.imshow(image)
-    show_mask(masks, plt.gca())
-    show_points(input_point, input_label, plt.gca())
-    plt.axis("off")
-    plt.show()
-
-
-
-.. image:: segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_47_0.png
-
-
-Example with multiple points
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
-
-In this example, we provide additional points to cover a larger area of the object.
-
-.. code:: ipython3
-
-    input_point = np.array([[500, 375], [1125, 625], [575, 750], [1405, 575]])
-    input_label = np.array([1, 1, 1, 1])
-
-Now, the prompt for the model looks as shown in this image:
-
-.. code:: ipython3
-
-    plt.figure(figsize=(10, 10))
-    plt.imshow(image)
-    show_points(input_point, input_label, plt.gca())
-    plt.axis("off")
-    plt.show()
-
-
-
-.. image:: segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_51_0.png
-
-
-Transform the points as in the previous example.
-
-.. code:: ipython3
-
-    coord = np.concatenate([input_point, np.array([[0.0, 0.0]])], axis=0)[None, :, :]
-    label = np.concatenate([input_label, np.array([-1])], axis=0)[None, :].astype(np.float32)
-
-    coord = resizer.apply_coords(coord, image.shape[:2]).astype(np.float32)
-
-Package inputs, then predict and threshold the mask.
-
-.. code:: ipython3
-
-    inputs = {
-        "image_embeddings": encoding_results[ov_encoder.output(0)],
-        "high_res_feats_256": encoding_results[ov_encoder.output(1)],
-        "high_res_feats_128": encoding_results[ov_encoder.output(2)],
-        "point_coords": coord,
-        "point_labels": label,
-    }
-
-    results = ov_predictor(inputs)
-
-    masks = results[ov_predictor.output(0)]
-    masks = postprocess_masks(masks, image.shape[:-1], resizer)
-    masks = masks > 0.0
-
-.. code:: ipython3
-
-    plt.figure(figsize=(10, 10))
-    plt.imshow(image)
-    show_mask(masks, plt.gca())
-    show_points(input_point, input_label, plt.gca())
-    plt.axis("off")
-    plt.show()
-
-
-
-.. image:: segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_56_0.png
-
-
-Great! Now the predicted mask covers the whole truck.
-
-Example box and point input with negative label
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
-
-In this example, we define the input prompt using a bounding box and a point inside it. The bounding box is represented as a pair of points: its top-left corner and its bottom-right corner. Label 0 for the point indicates that it should be excluded from the mask.
-
-.. code:: ipython3
-
-    input_box = np.array([425, 600, 700, 875])
-    input_point = np.array([[575, 750]])
-    input_label = np.array([0])
-
-.. code:: ipython3
-
-    from ov_sam2_helper import show_box
-
-    plt.figure(figsize=(10, 10))
-    plt.imshow(image)
-    show_box(input_box, plt.gca())
-    show_points(input_point, input_label, plt.gca())
-    plt.axis("off")
-    plt.show()
-
-
-
-.. image:: segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_60_0.png
-
-
-Add a batch index, concatenate the box and point inputs, add the appropriate labels for the box corners, and transform the coordinates. There is no padding point since the input includes a box.
-
-.. code:: ipython3
-
-    box_coords = input_box.reshape(2, 2)
-    box_labels = np.array([2, 3])
-
-    coord = np.concatenate([input_point, box_coords], axis=0)[None, :, :]
-    label = np.concatenate([input_label, box_labels], axis=0)[None, :].astype(np.float32)
-
-    coord = resizer.apply_coords(coord, image.shape[:2]).astype(np.float32)
-
-Package inputs, then predict and threshold the mask.
-
-..
code:: ipython3 - - inputs = { - "image_embeddings": encoding_results[ov_encoder.output(0)], - "high_res_feats_256": encoding_results[ov_encoder.output(1)], - "high_res_feats_128": encoding_results[ov_encoder.output(2)], - "point_coords": coord, - "point_labels": label, - } - - results = ov_predictor(inputs) - - masks = results[ov_predictor.output(0)] - masks = postprocess_masks(masks, image.shape[:-1], resizer) - masks = masks > 0.0 - -.. code:: ipython3 - - plt.figure(figsize=(10, 10)) - plt.imshow(image) - show_mask(masks[0], plt.gca()) - show_box(input_box, plt.gca()) - show_points(input_point, input_label, plt.gca()) - plt.axis("off") - plt.show() - - - -.. image:: segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_65_0.png - - -Interactive segmentation with Gradio ------------------------------------- - - - -Now, you can try SAM on own image. Upload image to input window and -click on desired point, model predict segment based on your image and -point. - -.. code:: ipython3 - - class Segmenter: - def __init__(self, ov_encoder, ov_predictor): - self.encoder = ov_encoder - self.predictor = ov_predictor - self._img_embeddings = None - self._high_res_features_256 = None - self._high_res_features_128 = None - - def set_image(self, img: np.ndarray): - if self._img_embeddings is not None: - del self._img_embeddings - preprocessed_image = preprocess_image(img, resizer) - encoding_results = self.encoder(preprocessed_image) - image_embeddings = encoding_results[ov_encoder.output(0)] - self._img_embeddings = image_embeddings - self._high_res_features_256 = encoding_results[ov_encoder.output(1)] - self._high_res_features_128 = encoding_results[ov_encoder.output(2)] - return img - - def get_mask(self, points, img): - coord = np.array(points) - coord = np.concatenate([coord, np.array([[0, 0]])], axis=0) - coord = coord[None, :, :] - label = np.concatenate([np.ones(len(points)), np.array([-1])], axis=0)[None, :].astype(np.float32) - coord = resizer.apply_coords(coord, img.shape[:2]).astype(np.float32) - if self._img_embeddings is None: - self.set_image(img) - inputs = { - "image_embeddings": self._img_embeddings, - "high_res_feats_256": self._high_res_features_256, - "high_res_feats_128": self._high_res_features_128, - "point_coords": coord, - "point_labels": label, - } - - results = self.predictor(inputs) - masks = results[ov_predictor.output(0)] - masks = postprocess_masks(masks, img.shape[:-1], resizer) - - masks = masks > 0.0 - mask = masks[0] - mask = np.transpose(mask, (1, 2, 0)) - return mask - - - segmenter = Segmenter(ov_encoder, ov_predictor) - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/segment-anything/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(segmenter) - - try: - demo.launch() - except Exception: - demo.launch(share=True) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name="your server name", server_port="server port in int")` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ - -.. 
code:: ipython3 - - # please uncomment and run this cell for stopping gradio interface - # demo.close() - -Run OpenVINO model in automatic mask generation mode ----------------------------------------------------- - - - -Since SAM2 can efficiently process prompts, masks for the entire image -can be generated by sampling a large number of prompts over an image. -``automatic_mask_generation`` function implements this capability. It -works by sampling single-point input prompts in a grid over the image, -from each of which SAM can predict multiple masks. Then, masks are -filtered for quality and deduplicated using non-maximal suppression. -Additional options allow for further improvement of mask quality and -quantity, such as running prediction on multiple crops of the image or -postprocessing masks to remove small disconnected regions and holes. - -.. code:: ipython3 - - mask_generation_helper_file_name = "automatic_mask_generation_helper.py" - - if not Path(mask_generation_helper_file_name).exists(): - r = requests.get( - url=f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/segment-anything/{mask_generation_helper_file_name}", - ) - open(mask_generation_helper_file_name, "w").write(r.text) - -.. code:: ipython3 - - from automatic_mask_generation_helper import AutomaticMaskGenerationHelper - - mask_generator_helper = AutomaticMaskGenerationHelper(resizer, ov_predictor, ov_encoder) - -There are several tunable parameters in automatic mask generation that -control how densely points are sampled and what the thresholds are for -removing low quality or duplicate masks. Additionally, generation can be -automatically run on crops of the image to get improved performance on -smaller objects, and post-processing can remove stray pixels and holes - -.. code:: ipython3 - - prediction = mask_generator_helper.automatic_mask_generation(image) - -``automatic_mask_generation`` returns a list over masks, where each mask -is a dictionary containing various data about the mask. These keys are: - -- ``segmentation`` : the mask -- ``area`` : the area of the mask in pixels -- ``bbox`` : the boundary box of the mask in XYWH format -- ``predicted_iou`` : the model’s own prediction for the quality of the - mask -- ``point_coords`` : the sampled input point that generated this mask -- ``stability_score`` : an additional measure of mask quality -- ``crop_box`` : the crop of the image used to generate this mask in - XYWH format - -.. code:: ipython3 - - print(f"Number of detected masks: {len(prediction)}") - print(f"Annotation keys: {prediction[0].keys()}") - - -.. parsed-literal:: - - Number of detected masks: 11 - Annotation keys: dict_keys(['segmentation', 'area', 'bbox', 'predicted_iou', 'point_coords', 'stability_score', 'crop_box']) - - -.. code:: ipython3 - - import PIL - from automatic_mask_generation_helper import draw_anns - - out = draw_anns(image, prediction) - cv2.imwrite("result.png", out[:, :, ::-1]) - - PIL.Image.open("result.png") - - - -.. parsed-literal:: - - 0%| | 0/11 [00:00`__ provides a suite of -advanced algorithms for Neural Networks inference optimization in -OpenVINO with minimal accuracy drop. - -Since encoder costing much more time than other parts in SAM2 inference -pipeline, we will use 8-bit quantization in post-training mode (without -the fine-tuning pipeline) to optimize encoder of SAM2. - -The optimization process contains the following steps: - -1. Create a Dataset for quantization. -2. Run ``nncf.quantize`` for getting an optimized model. -3. 
Serialize OpenVINO IR model, using the ``openvino.save_model`` - function. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget(False) - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - skip_kernel_extension_file_name = "skip_kernel_extension.py" - - if not Path(skip_kernel_extension_file_name).exists(): - r = requests.get( - url=f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/{skip_kernel_extension_file_name}", - ) - open(skip_kernel_extension_file_name, "w").write(r.text) - - %load_ext skip_kernel_extension - - -.. parsed-literal:: - - The skip_kernel_extension extension is already loaded. To reload it, use: - %reload_ext skip_kernel_extension - - -Prepare a calibration dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Download COCO dataset. Since the dataset is used to calibrate the -model’s parameter instead of fine-tuning it, we don’t need to download -the label files. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from zipfile import ZipFile - - DATA_URL = "https://ultralytics.com/assets/coco128.zip" - OUT_DIR = Path(".") - - if not (OUT_DIR / "coco128/images/train2017").exists(): - download_file(DATA_URL, directory=OUT_DIR, show_progress=True) - with ZipFile("coco128.zip", "r") as zip_ref: - zip_ref.extractall(OUT_DIR) - - -.. parsed-literal:: - - 'coco128.zip' already exists. - - -Create an instance of the ``nncf.Dataset`` class that represents the -calibration dataset. For PyTorch, we can pass an instance of the -``torch.utils.data.DataLoader`` object. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import torch.utils.data as data - - - class COCOLoader(data.Dataset): - def __init__(self, images_path): - self.images = list(Path(images_path).iterdir()) - - def __getitem__(self, index): - image_path = self.images[index] - image = cv2.imread(str(image_path)) - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - return image - - def __len__(self): - return len(self.images) - - - coco_dataset = COCOLoader(OUT_DIR / "coco128/images/train2017") - calibration_loader = torch.utils.data.DataLoader(coco_dataset) - -The transformation function is a function that takes a sample from the -dataset and returns data that can be passed to the model for inference. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - - import nncf - - - def transform_fn(image_data): - """ - Quantization transform function. Extracts and preprocess input data from dataloader item for quantization. - Parameters: - image_data: image data produced by DataLoader during iteration - Returns: - input_tensor: input data in Dict format for model quantization - """ - image = image_data.numpy() - processed_image = preprocess_image(np.squeeze(image), resizer) - return processed_image - - - calibration_dataset = nncf.Dataset(calibration_loader, transform_fn) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, openvino - - -Run quantization and serialize OpenVINO IR model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The ``nncf.quantize`` function provides an interface for model -quantization. It requires an instance of the OpenVINO Model and -quantization dataset. It is available for models in the following -frameworks: ``PyTorch``, ``TensorFlow 2.x``, ``ONNX``, and -``OpenVINO IR``. 
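-
-For orientation, a minimal sketch of this interface is shown below, assuming the ``calibration_dataset`` created above and the FP32 encoder IR saved earlier; the notebook's actual call, which adds Transformer-specific options, follows in the next cells.
-
-.. code:: ipython3
-
-    # Minimal sketch of post-training quantization with default settings,
-    # followed by serialization of the optimized model.
-    import nncf
-    import openvino as ov
-
-    core = ov.Core()
-    fp32_model = core.read_model(ov_encoder_path)  # FP32 encoder IR saved earlier
-    quantized_sketch = nncf.quantize(fp32_model, calibration_dataset)
-    ov.save_model(quantized_sketch, "ov_image_encoder_int8_sketch.xml")  # hypothetical file name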
- -Optionally, some additional parameters for the configuration -quantization process (number of samples for quantization, preset, model -type, etc.) can be provided. ``model_type`` can be used to specify -quantization scheme required for specific type of the model. For -example, Transformer models such as SAM require a special quantization -scheme to preserve accuracy after quantization. To achieve a better -result, we will use a ``mixed`` quantization preset. It provides -symmetric quantization of weights and asymmetric quantization of -activations. - - **Note**: Model post-training quantization is time-consuming process. - Be patient, it can take several minutes depending on your hardware. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - model = core.read_model(ov_encoder_path) - quantized_model = nncf.quantize( - model, - calibration_dataset, - model_type=nncf.parameters.ModelType.TRANSFORMER, - subset_size=128, - ) - print("model quantization finished") - -.. code:: ipython3 - - ov_encoder_path_int8 = "ov_image_encoder_int8.xml" - -.. code:: ipython3 - - %%skip not $to_quantize.value - - ov.save_model(quantized_model, ov_encoder_path_int8) - -Validate Quantized Model Inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We can reuse the previous code to validate the output of ``INT8`` model. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - - # Load INT8 model and run pipeline again - ov_encoder_model_int8 = core.read_model(ov_encoder_path_int8) - ov_encoder_int8 = core.compile_model(ov_encoder_model_int8, device.value) - encoding_results = ov_encoder_int8(preprocessed_image) - image_embeddings = encoding_results[ov_encoder_int8.output(0)] - high_res_256 = encoding_results[ov_encoder_int8.output(1)] - high_res_128 = encoding_results[ov_encoder_int8.output(2)] - - input_point = np.array([[500, 375]]) - input_label = np.array([1]) - coord = np.concatenate([input_point, np.array([[0.0, 0.0]])], axis=0)[None, :, :] - label = np.concatenate([input_label, np.array([-1])], axis=0)[None, :].astype(np.float32) - - coord = resizer.apply_coords(coord, image.shape[:2]).astype(np.float32) - inputs = { - "image_embeddings": image_embeddings, - "high_res_feats_256": high_res_256, - "high_res_feats_128": high_res_128, - "point_coords": coord, - "point_labels": label, - } - results = ov_predictor(inputs) - - masks = results[ov_predictor.output(0)] - masks = postprocess_masks(masks, image.shape[:-1], resizer) - masks = masks > 0.0 - plt.figure(figsize=(10, 10)) - plt.imshow(image) - show_mask(masks, plt.gca()) - show_points(input_point, input_label, plt.gca()) - plt.axis("off") - plt.show() - - - -.. image:: segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_92_0.png - - -Run ``INT8`` model in automatic mask generation mode - -.. code:: ipython3 - - %%skip not $to_quantize.value - - - mask_generator_helper_int8 = AutomaticMaskGenerationHelper(resizer, ov_predictor, ov_encoder_int8) - - prediction = mask_generator_helper_int8.automatic_mask_generation(image) - out = draw_anns(image, prediction) - cv2.imwrite("result_int8.png", out[:, :, ::-1]) - PIL.Image.open("result_int8.png") - - - -.. parsed-literal:: - - 0%| | 0/5 [00:00`__ -to measure the inference performance of the ``FP32`` and ``INT8`` -models. - -.. code:: ipython3 - - if Path(ov_encoder_path).exists() and Path(ov_encoder_path_int8).exists(): - # Inference FP32 model (OpenVINO IR) - !benchmark_app -m $ov_encoder_path -d $device.value - - -.. 
parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ WARNING ] Default duration 120 seconds is used for unknown device AUTO - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.3.0-16041-1e3b88e4e3f-releases/2024/3 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.3.0-16041-1e3b88e4e3f-releases/2024/3 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 135.25 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] image (node: image) : f32 / [...] / [1,3,1024,1024] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: aten::view/Reshape_2) : f32 / [...] / [1,256,64,64] - [ INFO ] ***NO_NAME*** (node: aten::view/Reshape_1) : f32 / [...] / [1,32,256,256] - [ INFO ] ***NO_NAME*** (node: aten::view/Reshape) : f32 / [...] / [1,64,128,128] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] image (node: image) : u8 / [N,C,H,W] / [1,3,1024,1024] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: aten::view/Reshape_2) : f32 / [...] / [1,256,64,64] - [ INFO ] ***NO_NAME*** (node: aten::view/Reshape_1) : f32 / [...] / [1,32,256,256] - [ INFO ] ***NO_NAME*** (node: aten::view/Reshape) : f32 / [...] / [1,64,128,128] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 12310.29 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'image'!. This input will be filled with random values! - [ INFO ] Fill input 'image' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 120000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). 
- [ INFO ] First inference took 5245.18 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 84 iterations - [ INFO ] Duration: 138035.30 ms - [ INFO ] Latency: - [ INFO ] Median: 19586.42 ms - [ INFO ] Average: 19531.25 ms - [ INFO ] Min: 16426.16 ms - [ INFO ] Max: 21435.90 ms - [ INFO ] Throughput: 0.61 FPS - - -.. code:: ipython3 - - if Path(ov_encoder_path).exists() and Path(ov_encoder_path_int8).exists(): - # Inference INT8 model (OpenVINO IR) - !benchmark_app -m $ov_encoder_path_int8 -d $device.value - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ WARNING ] Default duration 120 seconds is used for unknown device AUTO - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.3.0-16041-1e3b88e4e3f-releases/2024/3 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.3.0-16041-1e3b88e4e3f-releases/2024/3 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 167.08 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] image (node: image) : f32 / [...] / [1,3,1024,1024] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: aten::view/Reshape_2) : f32 / [...] / [1,256,64,64] - [ INFO ] ***NO_NAME*** (node: aten::view/Reshape_1) : f32 / [...] / [1,32,256,256] - [ INFO ] ***NO_NAME*** (node: aten::view/Reshape) : f32 / [...] / [1,64,128,128] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] image (node: image) : u8 / [N,C,H,W] / [1,3,1024,1024] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: aten::view/Reshape_2) : f32 / [...] / [1,256,64,64] - [ INFO ] ***NO_NAME*** (node: aten::view/Reshape_1) : f32 / [...] / [1,32,256,256] - [ INFO ] ***NO_NAME*** (node: aten::view/Reshape) : f32 / [...] 
/ [1,64,128,128] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 14885.66 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'image'!. This input will be filled with random values! - [ INFO ] Fill input 'image' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 120000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 3194.85 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 120 iterations - [ INFO ] Duration: 136297.32 ms - [ INFO ] Latency: - [ INFO ] Median: 13600.36 ms - [ INFO ] Average: 13575.21 ms - [ INFO ] Min: 10246.83 ms - [ INFO ] Max: 15038.88 ms - [ INFO ] Throughput: 0.88 FPS - diff --git a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_12_0.png b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_12_0.png deleted file mode 100644 index 28c85e1bb32ad8..00000000000000 --- a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_12_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:42ab1a0ee1d3c5314f0949ddf3ff099e782c466f25555a53e8423b61cb8fa8b4 -size 478882 diff --git a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_16_0.png b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_16_0.png deleted file mode 100644 index 56ed35fc51b183..00000000000000 --- a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_16_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:56d781e32de5f4ea89d35d83b212136e4045cee5a2bfc4b91023c37b12a65312 -size 479883 diff --git a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_18_0.png b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_18_0.png deleted file mode 100644 index 978a2b87dfa203..00000000000000 --- 
a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_18_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:69535341ecdf717273461af9f54e3a479969e42e7fb17d553b9b39189de239c2 -size 469791 diff --git a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_33_0.png b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_33_0.png deleted file mode 100644 index ef76c145d3d8a4..00000000000000 --- a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_33_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:232d1ece544ecb239b82cdc3972f7275b9d270f3ac3200823c60c85c2aff4560 -size 467418 diff --git a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_39_0.png b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_39_0.png deleted file mode 100644 index 7f8306f25686bd..00000000000000 --- a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_39_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:64e949b99f9b56e90257363f088dce01ea5d660970c3d305444563e970f970e3 -size 468529 diff --git a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_47_0.png b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_47_0.png deleted file mode 100644 index 6689d5b1221a56..00000000000000 --- a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_47_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:af6830c93d79724f2d93c643478a0c6ccee7250061a9a25dd37b9d2bc2c6fb5d -size 469401 diff --git a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_51_0.png b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_51_0.png deleted file mode 100644 index 34fef4c69e00a2..00000000000000 --- a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_51_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:50bdbf9f8abdd3f839c75b38daeea9c431724e73dfdbcb91dd2ebd43d83a8540 -size 470668 diff --git a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_56_0.png b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_56_0.png deleted file mode 100644 index 19ee00ce503a51..00000000000000 --- a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_56_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b8d6b3fdf0e582d42a0252b4d37fde443d94bbaa9f39649b08f937a2783dae70 -size 466506 diff --git a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_60_0.png b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_60_0.png deleted file mode 100644 index a807e8e38ad601..00000000000000 --- a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_60_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:373e18d7528a781128445f4664b7f68819f573fe2f9c02e08b2aed7883375787 -size 468088 diff --git a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_65_0.png b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_65_0.png deleted file mode 100644 index 823fb7a6ecc72f..00000000000000 --- a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_65_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bc449a7c9939ed23eabe0d5cd646d34fd0b27d4a138840692ce094dcf5f4cf68 -size 470578 diff --git a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_77_1.jpg b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_77_1.jpg deleted file mode 100644 index 971cf6e3426b61..00000000000000 --- a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_77_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c9d8fc62d5e03d83efbcb463cf2bbfeb646d54c05fb7ae257b540b8d75de0234 -size 260563 diff --git a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_77_1.png b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_77_1.png deleted file mode 100644 index f11ab557c89b6f..00000000000000 --- a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_77_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61690a72440b65bfb5f4d6acc7783d98a820e54205ade3fb99b588b005bd826e -size 2461384 diff --git a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_92_0.png b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_92_0.png deleted file mode 100644 index 343e5ecc49fc50..00000000000000 --- a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_92_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:520c7390de98864c4ae6b24b940230e83f2b5fc0b1723d58ed9941cc2d9bc70f -size 469439 diff --git a/docs/notebooks/segment-anything-2-video-with-output.rst b/docs/notebooks/segment-anything-2-video-with-output.rst deleted file mode 100644 index 3533b2d84120be..00000000000000 --- a/docs/notebooks/segment-anything-2-video-with-output.rst +++ /dev/null @@ -1,887 +0,0 @@ -Object masks from prompts with SAM2 and OpenVINO -================================================ - -.. warning:: - - Important note: This notebook requires python >= 3.10. Please make - sure that your environment fulfill to this requirement before running - it - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. 
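-
-Since the notebook requires Python 3.10 or newer, a quick interpreter check such as the following sketch (not prescribed by the notebook itself) can catch an unsupported environment early:
-
-.. code:: ipython3
-
-    # Optional guard: fail early when the interpreter does not meet the
-    # Python >= 3.10 requirement stated in the warning above.
-    import sys
-
-    assert sys.version_info >= (3, 10), f"Python >= 3.10 is required, found {sys.version.split()[0]}"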
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Load the SAM2 PyTorch model <#load-the-sam2-pytorch-model>`__ -- `Convert Model to OpenVINO IR <#convert-model-to-openvino-ir>`__ - - - `Image Encoder Model <#image-encoder-model>`__ - - `Mask Prediction Model <#mask-prediction-model>`__ - - `Memory Encoder Model <#memory-encoder-model>`__ - - `Memory Attention Model <#memory-attention-model>`__ - - `Infer video prediction model in OpenVINO IR - format <#infer-video-prediction-model-in-openvino-ir-format>`__ - -- `Prepare data <#prepare-data>`__ - - - `Example with points <#example-with-points>`__ - - `Example with box <#example-with-box>`__ - - `Run Interactive For Video Segmentation with - Gradio <#run-interactive-for-video-segmentation-with-gradio>`__ - -Segmentation - identifying which image pixels belong to an object - is a -core task in computer vision and is used in a broad array of -applications, from analyzing scientific imagery to editing photos. But -creating an accurate segmentation model for specific tasks typically -requires highly specialized work by technical experts with access to AI -training infrastructure and large volumes of carefully annotated -in-domain data. Reducing the need for task-specific modeling expertise, -training compute, and custom data annotation for image segmentation is -the main goal of the Segment Anything project. - -`Segment Anything Model 2 (SAM 2) `__ is a -foundation model towards solving promptable visual segmentation in -images and videos. It extend SAM to video by considering images as a -video with a single frame. The SAM 2 model extends the promptable -capability of SAM to the video domain by adding a per session memory -module that captures information about the target object in the video. -This allows SAM 2 to track the selected object throughout all video -frames, even if the object temporarily disappears from view, as the -model has context of the object from previous frames. SAM 2 also -supports the ability to make corrections in the mask prediction based on -additional prompts on any frame. SAM 2’s streaming architecture—which -processes video frames one at a time—is also a natural generalization of -SAM to the video domain. When SAM 2 is applied to images, the memory -module is empty and the model behaves like SAM. - -The model design is a simple transformer architecture with streaming -memory for real-time video processing. The model is built a -model-in-the-loop data engine, which improves model and data via user -interaction, to collect SA-V dataset, the largest video segmentation -dataset to date. SAM 2 provides strong performance across a wide range -of tasks and visual domains. This notebook shows an example of how to -convert and use Segment Anything Model 2 in OpenVINO format, allowing it -to run on a variety of platforms that support an OpenVINO. - -The diagram below demonstrates the SAM 2 architecture. For a given -frame, the segmentation prediction is conditioned on the current prompt -and/or on previously observed memories. Videos are processed in a -streaming fashion with frames being consumed one at a time by the image -encoder, and cross-attended to memories of the target object from -previous frames. The mask decoder, which optionally also takes input -prompts, predicts the segmentation mask for that frame. Finally, a -memory encoder transforms the prediction and image encoder embeddings -(not shown in the figure) for use in future frames - -.. 
figure:: https://raw.githubusercontent.com/facebookresearch/segment-anything-2/main/assets/model_diagram.png - :alt: model_diagram - - model_diagram - -More details about approach can be found in the -`paper `__, -original -`repo `__ and -`Meta AI blog post `__ - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "gradio>=4.13" "openvino>=2024.4.0" "nncf>=2.13" "torch>=2.3.1" "torchvision>=0.18.1" tqdm numpy --extra-index-url https://download.pytorch.org/whl/cpu - %pip install "iopath>=0.1.10" "pillow>=9.4.0" "hydra-core>=1.3.2" - - %pip install -q "matplotlib>=3.4" - %pip uninstall -q -y opencv-python opencv-python-headless - %pip install -q opencv-python - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - - from notebook_utils import download_file - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("segment-anything-2-video.ipynb") - -Clone and install segment-anything-2 - -.. code:: ipython3 - - from cmd_helper import clone_repo - - - clone_repo("https://github.com/facebookresearch/sam2.git", revision="0db838b11726893f151fa5826ecfa744e1a7760f") - - - - -.. parsed-literal:: - - PosixPath('sam2') - - - -.. code:: ipython3 - - %env SAM2_BUILD_CUDA=0 - %cd sam2 - %pip install -q -e . - %cd .. - -.. code:: ipython3 - - ov_sam2_helper_file_name = "ov_sam2_helper.py" - - if not Path(ov_sam2_helper_file_name).exists(): - r = requests.get( - url=f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/sam2-video-segmentation/{ov_sam2_helper_file_name}", - ) - open(ov_sam2_helper_file_name, "w").write(r.text) - -.. code:: ipython3 - - img_helper_file_name = "image_helper.py" - - if not Path(ov_sam2_helper_file_name).exists(): - r = requests.get( - url=f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/sam2-video-segmentation/{img_helper_file_name}", - ) - open(img_helper_file_name, "w").write(r.text) - -Load the SAM2 PyTorch model ---------------------------- - - - -SAM2 can be seen as a generalization of SAM to the video (and image) -domain. SAM 2 supports point, box, and mask prompts on individual frames -to define the spatial extent of the object to be segmented across the -video. For image input, the model behaves similarly to SAM. A promptable -and light-weight mask decoder accepts a frame embedding and prompts (if -any) on the current frame and outputs a segmentation mask for the frame. -Prompts can be iteratively added on a frame in order to refine the -masks. Unlike SAM, the frame embedding used by the SAM 2 decoder is not -directly from an image encoder and is instead conditioned on memories of -past predictions and prompted frames. It is possible for prompted frames -to also come “from the future” relative to the current frame. Memories -of frames are created by the memory encoder based on the current -prediction and placed in a memory bank for use in subsequent frames. 
The -memory attention operation takes the per-frame embedding from the image -encoder and conditions it on the memory bank to produce an embedding -that is then passed to the mask decoder - -For promptable segmentation and tracking in videos, SAM2 provides a -video predictor with APIs for example to add prompts and propagate mask -throughout a video. SAM 2 supports video inference on multiple objects -and uses an inference state to keep track of the interactions in each -video - -We will use ``SAM2VideoPredictor`` class to load the model -``sam2-hiera-large`` from -`HuggingFace `__. List -of other SAM2 models could be found -`here `__. - -.. code:: ipython3 - - from sam2.sam2_video_predictor import SAM2VideoPredictor - - predictor_video = SAM2VideoPredictor.from_pretrained("facebook/sam2-hiera-large", device="cpu") - -Convert Model to OpenVINO IR ----------------------------- - - - -We split model on 4 independent parts: image_encoder, mask_predictor, -which combined of Prompt Encoder and Mask Decoder, memory encoder model -and memory attention model. - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Load the parameters for model ``sam2_hiera_large`` from config. - -Image Encoder Model -~~~~~~~~~~~~~~~~~~~ - - - -The image encoder is only run once for the entire interaction and its -role is to provide unconditioned tokens (feature embeddings) -representing each frame. - -Image Encoder input is tensor with shape ``1x3x1024x1024`` in ``NCHW`` -format, contains image for segmentation. Image Encoder output is visual -features, which includes: - ``backbone_fpn``: image embeddings as tensor -with shape ``1x256x64x64`` and high resolution image embeddings as -tensors with shape ``1x32x256x256``, ``1x64x128x128`` - -``vision_pos_enc`` - ``vision_features`` - -.. code:: ipython3 - - from pathlib import Path - import warnings - import torch - from ov_sam2_helper import SamVideoFrameEncoderModel - - ov_video_frame_encoder_path = Path("ov_video_image_encoder.xml") - if not ov_video_frame_encoder_path.exists(): - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=torch.jit.TracerWarning) - warnings.filterwarnings("ignore", category=UserWarning) - - video_frame_encoder_video = SamVideoFrameEncoderModel(predictor_video) - ov_video_frame_encoder_model = ov.convert_model( - video_frame_encoder_video, - example_input=torch.zeros(1, 3, 1024, 1024), - input=([1, 3, 1024, 1024],), - ) - ov.save_model(ov_video_frame_encoder_model, ov_video_frame_encoder_path) - else: - ov_video_frame_encoder_model = core.read_model(ov_video_frame_encoder_path) - -.. code:: ipython3 - - ov_video_frame_encoder_compl_model = core.compile_model(ov_video_frame_encoder_model, device.value) - -Mask Prediction Model -~~~~~~~~~~~~~~~~~~~~~ - - - -Mask prediction will be includes two models: - -- **Prompt Encoder** - Encoder for segmentation condition. As a - condition can be used points, boxes or segmentation mask. -- **Mask Decoder** - The mask decoder efficiently maps the image - embedding, prompt embeddings, and an output token to a mask. - -Combined prompt encoder and mask decoder model has following list of -inputs: - ``image_embeddings``: The image embedding from image_encoder. -Has a batch index of length 1. 
- ``high_res_feats_256``: The high -resolution embedding from image_encoder. Has a batch index of length 1. -- ``high_res_feats_128``: The high resolution embedding from -image_encoder. Has a batch index of length 1. - ``point_coords``: -Coordinates of sparse input prompts, corresponding to both point inputs -and box inputs. Boxes are encoded using two points, one for the top-left -corner and one for the bottom-right corner. Has a batch index of length -1. - ``point_labels``: Labels for the sparse input prompts. 0 is a -negative input point, 1 is a positive input point, 2 is a top-left box -corner, 3 is a bottom-right box corner, and -1 is a padding point. \*If -there is no box input, a single padding point with label -1 and -coordinates (0.0, 0.0) should be concatenated. - -Model outputs: ``low_res_multimasks``, ``high_res_multimasks``, ``ious`` -- intersection over union predictions, ``low_res_masks`` - predicted -masks before postprocessing, can be used as mask input for model, -``high_res_masks``, ``obj_ptr``, ``object_score_logits`` - -.. code:: ipython3 - - from ov_sam2_helper import SamVideoMaskPredictorModel - - ov_video_mask_pred_path = Path("ov_video_mask_predictor.xml") - if not ov_video_mask_pred_path.exists(): - mask_pred_model = SamVideoMaskPredictorModel(predictor_video, multimask_output=False) - embed_dim = predictor_video.sam_prompt_encoder.embed_dim - embed_size = predictor_video.sam_prompt_encoder.image_embedding_size - - dummy_inputs = { - "backbone_features": torch.randn(1, embed_dim, *embed_size, dtype=torch.float), - "high_res_feats_256": torch.randn(1, 32, 256, 256, dtype=torch.float), - "high_res_feats_128": torch.randn(1, 64, 128, 128, dtype=torch.float), - "point_coords": torch.randint(low=0, high=1024, size=(1, 5, 2), dtype=torch.float), - "point_labels": torch.randint(low=0, high=4, size=(1, 5), dtype=torch.float), - } - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=torch.jit.TracerWarning) - warnings.filterwarnings("ignore", category=UserWarning) - ov_video_mask_pred_model = ov.convert_model(mask_pred_model, example_input=dummy_inputs) - ov.save_model(ov_video_mask_pred_model, ov_video_mask_pred_path) - else: - ov_video_mask_pred_model = core.read_model(ov_video_mask_pred_path) - -.. code:: ipython3 - - ov_video_mask_pred_compl_model = core.compile_model(ov_video_mask_pred_model, device.value) - -Memory Encoder Model -~~~~~~~~~~~~~~~~~~~~ - - - -Predictions and embeddings from the image encoder for the frame are -processed by the memory encoder and saved for future use. It holds the -past predictions for the object and keeps track of prompts from the -prompt encoder for relevant frames. The memory encoder generates a -memory by down sampling the output mask using a convolutional module and -summing it element-wise with the unconditioned frame embedding from the -image-encoder, followed by light-weight convolutional layers to fuse the -information. - -Converted model has following list of inputs: ``pix_feat``, -``mask_for_mem``, ``skip_mask_sigmoid`` - -Model outputs: ``vision_features``, ``vision_pos_enc`` - -.. 
code:: ipython3 - - from ov_sam2_helper import SamVideoMemoryEncoderModel - - ov_video_mem_encoder_path = Path("ov_video_mem_encoder.xml") - if not ov_video_mem_encoder_path.exists(): - dummy_inputs = { - "pix_feat": torch.randn(1, 256, 64, 64, dtype=torch.float), - "mask_for_mem": torch.randn(1, 1, 1024, 1024, dtype=torch.float), - "skip_mask_sigmoid": torch.tensor(1), - } - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=torch.jit.TracerWarning) - warnings.filterwarnings("ignore", category=UserWarning) - - video_mem_encoder_model = SamVideoMemoryEncoderModel(predictor_video) - ov_video_mem_encoder_model = ov.convert_model(video_mem_encoder_model, example_input=dummy_inputs) - ov.save_model(ov_video_mem_encoder_model, ov_video_mem_encoder_path) - else: - ov_video_mem_encoder_model = core.read_model(ov_video_mem_encoder_path) - -.. code:: ipython3 - - ov_video_mem_encoder_compl_model = core.compile_model(ov_video_mem_encoder_model, device.value) - -Memory Attention Model -~~~~~~~~~~~~~~~~~~~~~~ - - - -Memory attention aligns current frame features with past data such as -features of frames and predictions and prompts, arranging transformer -blocks where the first block extracts features. SAM2 stacks L -transformer blocks, the first one taking the image encoding from the -current frame as input. Each block performs self-attention, followed by -cross-attention to memories of (prompted/unprompted) frames and object -pointers (see below), stored in a memory bank (see below), followed by -an MLP. We use vanilla attention operations for self- and -cross-attention, allowing us to benefit from recent developments in -efficient attention kernels. - -Converted model has following list of inputs: - ``curr``: self-attention -inputs - ``memory``: cross-attention inputs - ``curr_pos``: ``pos_enc`` -for self-attention inputs - ``memory_pos``: ``pos_enc`` for -cross-attention inputs - ``num_obj_ptr_tokens``: number of object -pointer *tokens* - -Model outputs is ``normed_output`` - -.. code:: ipython3 - - import yaml - - sam2_config = "./sam2/sam2_configs/sam2_hiera_l.yaml" - - with open(sam2_config, "r") as f: - model_info = yaml.safe_load(f) - -Handling of complex tensors is supported only 2D rotations. So, let’s -update the memory attention model to being able convert it. - -.. 
code:: ipython3 - - from ov_sam2_helper import get_rotation_matrices, matrix_rope_forward - - # Matrix version of rotary enc - # https://github.com/facebookresearch/segment-anything-2/issues/186 - - import types - - funcType = types.MethodType - - for layer in predictor_video.memory_attention.layers: - self_attn_feat_sizes = model_info["model"]["memory_attention"]["layer"]["self_attention"]["feat_sizes"] - self_attn_rope_theta = model_info["model"]["memory_attention"]["layer"]["self_attention"]["rope_theta"] - self_attn_rotmats = get_rotation_matrices( - dim=layer.self_attn.internal_dim // layer.self_attn.num_heads, end_x=self_attn_feat_sizes[0], end_y=self_attn_feat_sizes[1], theta=self_attn_rope_theta - ) - setattr(layer.self_attn, "rotmats", self_attn_rotmats) - setattr(layer.self_attn, "rope_theta", self_attn_rope_theta) - layer.self_attn.forward = funcType(matrix_rope_forward, layer.self_attn) - - cross_attn_feat_sizes = model_info["model"]["memory_attention"]["layer"]["self_attention"]["feat_sizes"] - cross_attn_rope_theta = model_info["model"]["memory_attention"]["layer"]["self_attention"]["rope_theta"] - cross_attn_rotmats = get_rotation_matrices( - dim=layer.cross_attn_image.internal_dim // layer.cross_attn_image.num_heads, - end_x=cross_attn_feat_sizes[0], - end_y=cross_attn_feat_sizes[1], - theta=cross_attn_rope_theta, - ) - setattr(layer.cross_attn_image, "rotmats", cross_attn_rotmats) - setattr(layer.cross_attn_image, "rope_theta", cross_attn_rope_theta) - layer.cross_attn_image.forward = funcType(matrix_rope_forward, layer.cross_attn_image) - -.. code:: ipython3 - - from ov_sam2_helper import SamVideoMemoryAttentionModel - - ov_memory_attention_path = Path("ov_memory_attention_model.xml") - if not ov_memory_attention_path.exists(): - dummy_inputs = { - "curr": torch.zeros(4096, 1, 256), - "curr_pos": torch.zeros(4096, 1, 256), - "memory": torch.zeros(4100, 1, 64), - "memory_pos": torch.zeros(4100, 1, 64), - "num_obj_ptr_tokens": torch.tensor(4), - } - - memory_attention = SamVideoMemoryAttentionModel(predictor_video) - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=torch.jit.TracerWarning) - warnings.filterwarnings("ignore", category=UserWarning) - - ov_memory_attention_model = ov.convert_model( - memory_attention, - example_input=dummy_inputs, - ) - ov.save_model(ov_memory_attention_model, ov_memory_attention_path) - else: - ov_memory_attention_model = core.read_model(ov_memory_attention_path) - -.. code:: ipython3 - - ov_memory_attention_comp_model = core.compile_model(ov_memory_attention_model, device.value) - -Infer video prediction model in OpenVINO IR format -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To infer video we will use ``OVSAM2VideoPredictor``. -``OVSAM2VideoPredictor`` inherits ``SAM2VideoPredictor`` behavior, but -overloads some functions to use models converted to OpenVINO IR format. - -.. code:: ipython3 - - memory_encoder_out_proj_weight_shape = None - - if hasattr(predictor_video.memory_encoder, "out_proj") and hasattr(predictor_video.memory_encoder.out_proj, "weight"): - # if there is compression of memories along channel dim - memory_encoder_out_proj_weight_shape = predictor_video.memory_encoder.out_proj.weight.shape[0] - - setattr(ov_memory_attention_comp_model, "d_model", predictor_video.memory_attention.d_model) - -.. 
code:: ipython3 - - from ov_sam2_helper import OVSAM2VideoPredictor - - ov_predictor_video = OVSAM2VideoPredictor.from_pretrained( - model_info=model_info, - ov_image_encoder=ov_video_frame_encoder_compl_model, - ov_mask_encoder=ov_video_mask_pred_compl_model, - ov_memory_encoder=ov_video_mem_encoder_compl_model, - ov_memory_attention_model=ov_memory_attention_comp_model, - memory_encoder_out_proj_weight_shape=memory_encoder_out_proj_weight_shape, - ) - -Prepare data ------------- - - - -.. code:: ipython3 - - sample_path = Path("data") - sample_path.mkdir(parents=True, exist_ok=True) - - if not (sample_path / "coco.mp4").exists(): - download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Coco%20Walking%20in%20Berkeley.mp4", - directory=sample_path, - filename="coco.mp4", - ) - - - -.. parsed-literal:: - - data/coco.mp4: 0%| | 0.00/877k [00:00 0.0).cpu().numpy() for i, out_obj_id in enumerate(out_obj_ids)} - - return self.video_segments - - def save_as_video(self, output_filename="results.mp4"): - frame_names = [p for p in os.listdir(self.work_dir_path) if os.path.splitext(p)[-1] in [".jpg", ".jpeg", ".JPG", ".JPEG"]] - frame_names.sort(key=lambda p: int(os.path.splitext(p)[0])) - - FOURCC = cv2.VideoWriter_fourcc(*"vp09") - out = cv2.VideoWriter(output_filename, FOURCC, 30, (int(self.width), int(self.height))) - - color = np.random.randint(0, 255, size=(1, 1, 3)) - - for out_frame_idx in range(len(frame_names)): - img = cv2.imread(os.path.join(self.work_dir_path, frame_names[out_frame_idx])) - - for out_obj_id, out_mask in self.video_segments[out_frame_idx].items(): - out_mask = np.transpose(out_mask, (1, 2, 0)) - - out_img = img.copy() - out_img[out_mask.squeeze(-1)] = color - result = cv2.addWeighted(img.astype(np.float32), 0.7, out_img.astype(np.float32), 0.3, 0) - - result = result.astype(np.uint8) - out.write(result) - - cv2.destroyAllWindows() - out.release() - - return output_filename - -Example with points -~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - sample_path = Path("data") - - segmenter_video = SegmenterVideo(ov_predictor_video, max_frame_size=25) - segmenter_video.set_video(sample_path / "coco.mp4") - - -.. parsed-literal:: - - frame loading (JPEG): 100%|█████████████████████| 25/25 [00:00<00:00, 54.94it/s] - - - - -.. parsed-literal:: - - PosixPath('data/coco.mp4') - - - - - -.. parsed-literal:: - - PosixPath('data/coco.mp4') - - - -.. code:: ipython3 - - from image_helper import show_points, show_mask - import matplotlib.pyplot as plt - from PIL import Image - - input_point = np.array([[430, 130], [490, 100], [480, 60]]) - input_label = np.array([1, 1, 0]) - - out_obj_ids, out_mask_logits = segmenter_video.add_new_points_or_box(input_point, input_label) - - frame_0 = Image.open(os.path.join(segmenter_video.work_dir_path, "0.jpg")) - plt.imshow(np.array(frame_0.convert("RGB"))) - show_points(np.array(input_point), np.array(input_label), plt.gca()) - show_mask((out_mask_logits[0] > 0.0).cpu().numpy(), plt.gca()) - plt.show() - - -.. parsed-literal:: - - /home/labuser/work/notebook/openvino_notebooks/notebooks/sam2-video-segmentation/sam2/sam2/sam2_video_predictor.py:873: UserWarning: cannot import name '_C' from 'sam2' (/home/labuser/work/notebook/openvino_notebooks/notebooks/sam2-video-segmentation/sam2/sam2/__init__.py) - - Skipping the post-processing step due to the error above. 
You can still use SAM 2 and it's OK to ignore the error above, although some post-processing functionality may be limited (which doesn't affect the results in most cases; see https://github.com/facebookresearch/segment-anything-2/blob/main/INSTALL.md). - pred_masks_gpu = fill_holes_in_mask_scores( - - - -.. image:: segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_40_1.png - - -.. code:: ipython3 - - res = segmenter_video.propagate_in_video() - - -.. parsed-literal:: - - propagate in video: 100%|███████████████████████| 25/25 [03:52<00:00, 9.30s/it] - propagate in video: 100%|██████████| 50/50 [08:12<00:00, 9.85s/it] - - -.. code:: ipython3 - - video_out_path = segmenter_video.save_as_video("point_results.mp4") - -.. code:: ipython3 - - import IPython - - IPython.display.Video(filename=video_out_path, embed=True) - - - - -.. raw:: html - - - - - -Example with box -~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - segmenter_video.set_video(sample_path / "coco.mp4") - - -.. parsed-literal:: - - frame loading (JPEG): 100%|█████████████████████| 25/25 [00:00<00:00, 55.18it/s] - - - - -.. parsed-literal:: - - PosixPath('data/coco.mp4') - - - -.. code:: ipython3 - - from image_helper import show_box - - input_box = np.array([385, 80, 525, 260]) - - out_obj_ids, out_mask_logits = segmenter_video.add_new_points_or_box(box=input_box) - - frame_0 = Image.open(os.path.join(segmenter_video.work_dir_path, "0.jpg")) - plt.imshow(np.array(frame_0.convert("RGB"))) - show_mask((out_mask_logits[0] > 0.0).cpu().numpy(), plt.gca()) - show_box(input_box, plt.gca()) - plt.show() - - - -.. image:: segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_46_0.png - - -.. code:: ipython3 - - res = segmenter_video.propagate_in_video() - - -.. parsed-literal:: - - propagate in video: 100%|███████████████████████| 25/25 [03:53<00:00, 9.33s/it] - - -.. code:: ipython3 - - video_out_path = segmenter_video.save_as_video("box_results.mp4") - -.. code:: ipython3 - - IPython.display.Video(filename=video_out_path, embed=True) - - - - -.. raw:: html - - - - - -Run Interactive For Video Segmentation with Gradio -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/segment-anything/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_video_demo - - segmenter = SegmenterVideo(ov_predictor_video) - demo = make_video_demo(segmenter, sample_path) - - try: - demo.launch(debug=True) - except Exception: - demo.launch(share=True, debug=True) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name="your server name", server_port="server port in int")` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ diff --git a/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_40_1.png b/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_40_1.png deleted file mode 100644 index 134c53854996bc..00000000000000 --- a/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_40_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:48a50a248dbaeb4c0eb98d220b5b6fb8846fdac4cfbedfbda61db2a9d4580769 -size 193591 diff --git a/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_46_0.png b/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_46_0.png deleted file mode 100644 index 4a5a6bc047a71a..00000000000000 --- a/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_46_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ce9761534ff992aaabc8a32a0c2e4e2135181e8fc3f4b513303babff98484c00 -size 190196 diff --git a/docs/notebooks/segment-anything-with-output.rst b/docs/notebooks/segment-anything-with-output.rst deleted file mode 100644 index c3d4cc340fa85b..00000000000000 --- a/docs/notebooks/segment-anything-with-output.rst +++ /dev/null @@ -1,1705 +0,0 @@ -Object masks from prompts with SAM and OpenVINO -=============================================== - - -**Table of contents:** - - -- `Background <#background>`__ -- `Prerequisites <#prerequisites>`__ -- `Convert model to OpenVINO Intermediate - Representation <#convert-model-to-openvino-intermediate-representation>`__ - - - `Download model checkpoint and create PyTorch - model <#download-model-checkpoint-and-create-pytorch-model>`__ - - `Image Encoder <#image-encoder>`__ - - `Mask predictor <#mask-predictor>`__ - -- `Run OpenVINO model in interactive segmentation - mode <#run-openvino-model-in-interactive-segmentation-mode>`__ - - - `Example Image <#example-image>`__ - - `Preprocessing and visualization - utilities <#preprocessing-and-visualization-utilities>`__ - - `Image encoding <#image-encoding>`__ - - `Example point input <#example-point-input>`__ - - `Example with multiple points <#example-with-multiple-points>`__ - - `Example box and point input with negative - label <#example-box-and-point-input-with-negative-label>`__ - -- `Interactive segmentation <#interactive-segmentation>`__ -- `Run OpenVINO model in automatic mask generation - mode <#run-openvino-model-in-automatic-mask-generation-mode>`__ -- `Optimize encoder using NNCF Post-training Quantization - API <#optimize-encoder-using-nncf-post-training-quantization-api>`__ - - - `Prepare a calibration dataset <#prepare-a-calibration-dataset>`__ - - `Run 
quantization and serialize OpenVINO IR - model <#run-quantization-and-serialize-openvino-ir-model>`__ - - `Validate Quantized Model - Inference <#validate-quantized-model-inference>`__ - - `Compare Performance of the Original and Quantized - Models <#compare-performance-of-the-original-and-quantized-models>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Segmentation - identifying which image pixels belong to an object - is a -core task in computer vision and is used in a broad array of -applications, from analyzing scientific imagery to editing photos. But -creating an accurate segmentation model for specific tasks typically -requires highly specialized work by technical experts with access to AI -training infrastructure and large volumes of carefully annotated -in-domain data. Reducing the need for task-specific modeling expertise, -training compute, and custom data annotation for image segmentation is -the main goal of the `Segment -Anything `__ project. - -The `Segment Anything Model -(SAM) `__ predicts -object masks given prompts that indicate the desired object. SAM has -learned a general notion of what objects are, and it can generate masks -for any object in any image or any video, even including objects and -image types that it had not encountered during training. SAM is general -enough to cover a broad set of use cases and can be used out of the box -on new image “domains” (e.g. underwater photos, MRI or cell microscopy) -without requiring additional training (a capability often referred to as -zero-shot transfer). This notebook shows an example of how to convert -and use Segment Anything Model in OpenVINO format, allowing it to run on -a variety of platforms that support an OpenVINO. - -Background ----------- - - - -Previously, to solve any kind of segmentation problem, there were two -classes of approaches. The first, interactive segmentation, allowed for -segmenting any class of object but required a person to guide the method -by iterative refining a mask. The second, automatic segmentation, -allowed for segmentation of specific object categories defined ahead of -time (e.g., cats or chairs) but required substantial amounts of manually -annotated objects to train (e.g., thousands or even tens of thousands of -examples of segmented cats), along with the compute resources and -technical expertise to train the segmentation model. Neither approach -provided a general, fully automatic approach to segmentation. - -Segment Anything Model is a generalization of these two classes of -approaches. It is a single model that can easily perform both -interactive segmentation and automatic segmentation. The Segment -Anything Model (SAM) produces high quality object masks from input -prompts such as points or boxes, and it can be used to generate masks -for all objects in an image. It has been trained on a -`dataset `__ of 11 -million images and 1.1 billion masks, and has strong zero-shot -performance on a variety of segmentation tasks. The model consists of 3 -parts: - -- **Image Encoder** - Vision Transformer model (VIT) pretrained using - Masked Auto Encoders approach (MAE) for encoding image to embedding - space. The image encoder runs once per image and can be applied prior - to prompting the model. 
-- **Prompt Encoder** - Encoder for segmentation condition. As a - condition can be used: - - - points - set of points related to object which should be - segmented. Prompt encoder converts points to embedding using - positional encoding. - - boxes - bounding box where object for segmentation is located. - Similar to points, coordinates of bounding box encoded via - positional encoding. - - segmentation mask - provided by user segmentation mask is embedded - using convolutions and summed element-wise with the image - embedding. - - text - encoded by CLIP model text representation - -- **Mask Decoder** - The mask decoder efficiently maps the image - embedding, prompt embeddings, and an output token to a mask. - -The diagram below demonstrates the process of mask generation using SAM: -|model_diagram| - -The model first converts the image into an image embedding that allows -high quality masks to be efficiently produced from a prompt. The model -returns multiple masks which fit to the provided prompt and its score. -The provided masks can be overlapped areas as it shown on diagram, it is -useful for complicated cases when prompt can be interpreted in different -manner, e.g. segment whole object or only its specific part or when -provided point at the intersection of multiple objects. The model’s -promptable interface allows it to be used in flexible ways that make a -wide range of segmentation tasks possible simply by engineering the -right prompt for the model (clicks, boxes, text, and so on). - -More details about approach can be found in the -`paper `__, original -`repo `__ and -`Meta AI blog -post `__ - -.. |model_diagram| image:: https://raw.githubusercontent.com/facebookresearch/segment-anything/main/assets/model_diagram.png - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "segment_anything" "gradio>=4.13" "openvino>=2023.1.0" "nncf>=2.7.0" "torch>=2.1" "torchvision>=0.16" Pillow opencv-python tqdm --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "matplotlib>=3.4" - -Convert model to OpenVINO Intermediate Representation ------------------------------------------------------ - - - -Download model checkpoint and create PyTorch model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -There are several Segment Anything Model -`checkpoints `__ -available for downloading In this tutorial we will use model based on -``vit_b``, but the demonstrated approach is very general and applicable -to other SAM models. Set the model URL, path for saving checkpoint and -model type below to a SAM model checkpoint, then load the model using -``sam_model_registry``. - -.. code:: ipython3 - - # Fetch `notebook_utils` module - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, device_widget - - checkpoint = "sam_vit_b_01ec64.pth" - model_url = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth" - model_type = "vit_b" - - if not Path(checkpoint).exists(): - download_file(model_url) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("segment-anything.ipynb") - -.. 
code:: ipython3 - - from segment_anything import sam_model_registry - - sam = sam_model_registry[model_type](checkpoint=checkpoint) - -As we already discussed, Image Encoder part can be used once per image, -then changing prompt, prompt encoder and mask decoder can be run -multiple times to retrieve different objects from the same image. Taking -into account this fact, we split model on 2 independent parts: -image_encoder and mask_predictor (combination of Prompt Encoder and Mask -Decoder). - -Image Encoder -~~~~~~~~~~~~~ - - - -Image Encoder input is tensor with shape ``1x3x1024x1024`` in ``NCHW`` -format, contains image for segmentation. Image Encoder output is image -embeddings, tensor with shape ``1x256x64x64`` - -.. code:: ipython3 - - import warnings - from pathlib import Path - import torch - import openvino as ov - - core = ov.Core() - - ov_encoder_path = Path("sam_image_encoder.xml") - if not ov_encoder_path.exists(): - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=torch.jit.TracerWarning) - warnings.filterwarnings("ignore", category=UserWarning) - - ov_encoder_model = ov.convert_model( - sam.image_encoder, - example_input=torch.zeros(1, 3, 1024, 1024), - input=([1, 3, 1024, 1024],), - ) - ov.save_model(ov_encoder_model, ov_encoder_path) - else: - ov_encoder_model = core.read_model(ov_encoder_path) - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - ov_encoder = core.compile_model(ov_encoder_model, device.value) - -Mask predictor -~~~~~~~~~~~~~~ - - - -This notebook expects the model was exported with the parameter -``return_single_mask=True``. It means that model will only return the -best mask, instead of returning multiple masks. For high resolution -images this can improve runtime when upscaling masks is expensive. - -Combined prompt encoder and mask decoder model has following list of -inputs: - -- ``image_embeddings``: The image embedding from ``image_encoder``. Has - a batch index of length 1. -- ``point_coords``: Coordinates of sparse input prompts, corresponding - to both point inputs and box inputs. Boxes are encoded using two - points, one for the top-left corner and one for the bottom-right - corner. *Coordinates must already be transformed to long-side 1024.* - Has a batch index of length 1. -- ``point_labels``: Labels for the sparse input prompts. 0 is a - negative input point, 1 is a positive input point, 2 is a top-left - box corner, 3 is a bottom-right box corner, and -1 is a padding - point. \*If there is no box input, a single padding point with label - -1 and coordinates (0.0, 0.0) should be concatenated. - -Model outputs: - -- ``masks`` - predicted masks resized to original image size, to obtain - a binary mask, should be compared with ``threshold`` (usually equal - 0.0). -- ``iou_predictions`` - intersection over union predictions -- ``low_res_masks`` - predicted masks before postprocessing, can be - used as mask input for model. - -.. 
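To make the prompt encoding rules above a bit more tangible, here is a small, hypothetical helper (the function and its name are illustrative only; the notebook builds these arrays inline in the examples further down) that assembles ``point_coords`` and ``point_labels`` for a set of clicks and an optional box:

.. code:: ipython3

    import numpy as np


    def build_prompt(points=None, labels=None, box=None):
        """Illustrative helper: assemble point_coords / point_labels as described above."""
        coords, lbls = [], []
        if points is not None:
            coords.append(np.asarray(points, dtype=np.float32))
            lbls.append(np.asarray(labels, dtype=np.float32))
        if box is not None:
            # A box is encoded as two labelled points: top-left (2) and bottom-right (3).
            coords.append(np.asarray(box, dtype=np.float32).reshape(2, 2))
            lbls.append(np.array([2, 3], dtype=np.float32))
        else:
            # Without a box input, a single padding point with label -1 is appended.
            coords.append(np.zeros((1, 2), dtype=np.float32))
            lbls.append(np.array([-1], dtype=np.float32))
        coord = np.concatenate(coords, axis=0)[None, :, :]  # add batch dimension
        label = np.concatenate(lbls, axis=0)[None, :]
        return coord, label


    # One positive click; the coordinates still need to be rescaled to the 1024 long side,
    # for example with the `resizer.apply_coords` helper defined later in this notebook.
    coord, label = build_prompt(points=[[500, 375]], labels=[1])

..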
code:: ipython3 - - from typing import Tuple - - - class SamExportableModel(torch.nn.Module): - def __init__( - self, - model, - return_single_mask: bool, - use_stability_score: bool = False, - return_extra_metrics: bool = False, - ) -> None: - super().__init__() - self.mask_decoder = model.mask_decoder - self.model = model - self.img_size = model.image_encoder.img_size - self.return_single_mask = return_single_mask - self.use_stability_score = use_stability_score - self.stability_score_offset = 1.0 - self.return_extra_metrics = return_extra_metrics - - def _embed_points(self, point_coords: torch.Tensor, point_labels: torch.Tensor) -> torch.Tensor: - point_coords = point_coords + 0.5 - point_coords = point_coords / self.img_size - point_embedding = self.model.prompt_encoder.pe_layer._pe_encoding(point_coords) - point_labels = point_labels.unsqueeze(-1).expand_as(point_embedding) - - point_embedding = point_embedding * (point_labels != -1).to(torch.float32) - point_embedding = point_embedding + self.model.prompt_encoder.not_a_point_embed.weight * (point_labels == -1).to(torch.float32) - - for i in range(self.model.prompt_encoder.num_point_embeddings): - point_embedding = point_embedding + self.model.prompt_encoder.point_embeddings[i].weight * (point_labels == i).to(torch.float32) - - return point_embedding - - def t_embed_masks(self, input_mask: torch.Tensor) -> torch.Tensor: - mask_embedding = self.model.prompt_encoder.mask_downscaling(input_mask) - return mask_embedding - - def mask_postprocessing(self, masks: torch.Tensor) -> torch.Tensor: - masks = torch.nn.functional.interpolate( - masks, - size=(self.img_size, self.img_size), - mode="bilinear", - align_corners=False, - ) - return masks - - def select_masks(self, masks: torch.Tensor, iou_preds: torch.Tensor, num_points: int) -> Tuple[torch.Tensor, torch.Tensor]: - # Determine if we should return the multiclick mask or not from the number of points. - # The reweighting is used to avoid control flow. 
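        # A single click plus the padding point gives num_points == 2, so the factor
        # (num_points - 2.5) below is negative and mask 0 is pushed far down the
        # ranking, letting one of the alternative masks win; with more points the
        # factor is positive and mask 0 is preferred - all without a data-dependent
        # branch, which keeps the traced graph static.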
- score_reweight = torch.tensor([[1000] + [0] * (self.model.mask_decoder.num_mask_tokens - 1)]).to(iou_preds.device) - score = iou_preds + (num_points - 2.5) * score_reweight - best_idx = torch.argmax(score, dim=1) - masks = masks[torch.arange(masks.shape[0]), best_idx, :, :].unsqueeze(1) - iou_preds = iou_preds[torch.arange(masks.shape[0]), best_idx].unsqueeze(1) - - return masks, iou_preds - - @torch.no_grad() - def forward( - self, - image_embeddings: torch.Tensor, - point_coords: torch.Tensor, - point_labels: torch.Tensor, - mask_input: torch.Tensor = None, - ): - sparse_embedding = self._embed_points(point_coords, point_labels) - if mask_input is None: - dense_embedding = self.model.prompt_encoder.no_mask_embed.weight.reshape(1, -1, 1, 1).expand( - point_coords.shape[0], -1, image_embeddings.shape[0], 64 - ) - else: - dense_embedding = self._embed_masks(mask_input) - - masks, scores = self.model.mask_decoder.predict_masks( - image_embeddings=image_embeddings, - image_pe=self.model.prompt_encoder.get_dense_pe(), - sparse_prompt_embeddings=sparse_embedding, - dense_prompt_embeddings=dense_embedding, - ) - - if self.use_stability_score: - scores = calculate_stability_score(masks, self.model.mask_threshold, self.stability_score_offset) - - if self.return_single_mask: - masks, scores = self.select_masks(masks, scores, point_coords.shape[1]) - - upscaled_masks = self.mask_postprocessing(masks) - - if self.return_extra_metrics: - stability_scores = calculate_stability_score(upscaled_masks, self.model.mask_threshold, self.stability_score_offset) - areas = (upscaled_masks > self.model.mask_threshold).sum(-1).sum(-1) - return upscaled_masks, scores, stability_scores, areas, masks - - return upscaled_masks, scores - - - ov_model_path = Path("sam_mask_predictor.xml") - if not ov_model_path.exists(): - exportable_model = SamExportableModel(sam, return_single_mask=True) - embed_dim = sam.prompt_encoder.embed_dim - embed_size = sam.prompt_encoder.image_embedding_size - dummy_inputs = { - "image_embeddings": torch.randn(1, embed_dim, *embed_size, dtype=torch.float), - "point_coords": torch.randint(low=0, high=1024, size=(1, 5, 2), dtype=torch.float), - "point_labels": torch.randint(low=0, high=4, size=(1, 5), dtype=torch.float), - } - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=torch.jit.TracerWarning) - warnings.filterwarnings("ignore", category=UserWarning) - ov_model = ov.convert_model(exportable_model, example_input=dummy_inputs) - ov.save_model(ov_model, ov_model_path) - else: - ov_model = core.read_model(ov_model_path) - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - ov_predictor = core.compile_model(ov_model, device.value) - -Run OpenVINO model in interactive segmentation mode ---------------------------------------------------- - - - -Example Image -~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - import numpy as np - import cv2 - import matplotlib.pyplot as plt - - download_file("https://raw.githubusercontent.com/facebookresearch/segment-anything/main/notebooks/images/truck.jpg") - image = cv2.imread("truck.jpg") - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - - -.. parsed-literal:: - - 'truck.jpg' already exists. - - -.. code:: ipython3 - - plt.figure(figsize=(10, 10)) - plt.imshow(image) - plt.axis("off") - plt.show() - - - -.. 
image:: segment-anything-with-output_files/segment-anything-with-output_21_0.png - - -Preprocessing and visualization utilities -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To prepare input for Image Encoder we should: - -1. Convert BGR image to RGB -2. Resize image saving aspect ratio where longest size equal to Image - Encoder input size - 1024. -3. Normalize image subtract mean values (123.675, 116.28, 103.53) and - divide by std (58.395, 57.12, 57.375) -4. Transpose HWC data layout to CHW and add batch dimension. -5. Add zero padding to input tensor by height or width (depends on - aspect ratio) according Image Encoder expected input shape. - -These steps are applicable to all available models - -.. code:: ipython3 - - from copy import deepcopy - from typing import Tuple - from torchvision.transforms.functional import resize, to_pil_image - - - class ResizeLongestSide: - """ - Resizes images to longest side 'target_length', as well as provides - methods for resizing coordinates and boxes. Provides methods for - transforming numpy arrays. - """ - - def __init__(self, target_length: int) -> None: - self.target_length = target_length - - def apply_image(self, image: np.ndarray) -> np.ndarray: - """ - Expects a numpy array with shape HxWxC in uint8 format. - """ - target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) - return np.array(resize(to_pil_image(image), target_size)) - - def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: - """ - Expects a numpy array of length 2 in the final dimension. Requires the - original image size in (H, W) format. - """ - old_h, old_w = original_size - new_h, new_w = self.get_preprocess_shape(original_size[0], original_size[1], self.target_length) - coords = deepcopy(coords).astype(float) - coords[..., 0] = coords[..., 0] * (new_w / old_w) - coords[..., 1] = coords[..., 1] * (new_h / old_h) - return coords - - def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: - """ - Expects a numpy array shape Bx4. Requires the original image size - in (H, W) format. - """ - boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) - return boxes.reshape(-1, 4) - - @staticmethod - def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: - """ - Compute the output size given input size and target long side length. - """ - scale = long_side_length * 1.0 / max(oldh, oldw) - newh, neww = oldh * scale, oldw * scale - neww = int(neww + 0.5) - newh = int(newh + 0.5) - return (newh, neww) - - - resizer = ResizeLongestSide(1024) - - - def preprocess_image(image: np.ndarray): - resized_image = resizer.apply_image(image) - resized_image = (resized_image.astype(np.float32) - [123.675, 116.28, 103.53]) / [ - 58.395, - 57.12, - 57.375, - ] - resized_image = np.expand_dims(np.transpose(resized_image, (2, 0, 1)).astype(np.float32), 0) - - # Pad - h, w = resized_image.shape[-2:] - padh = 1024 - h - padw = 1024 - w - x = np.pad(resized_image, ((0, 0), (0, 0), (0, padh), (0, padw))) - return x - - - def postprocess_masks(masks: np.ndarray, orig_size): - size_before_pad = resizer.get_preprocess_shape(orig_size[0], orig_size[1], masks.shape[-1]) - masks = masks[..., : int(size_before_pad[0]), : int(size_before_pad[1])] - masks = torch.nn.functional.interpolate(torch.from_numpy(masks), size=orig_size, mode="bilinear", align_corners=False).numpy() - return masks - -.. 
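Before moving on to the visualization helpers, a quick check (illustrative only, not part of the original pipeline) shows how the resize affects prompt coordinates; the same transform is applied with ``resizer.apply_coords`` in every prompt example below:

.. code:: ipython3

    # The long side of the example image is scaled to 1024; point coordinates must be
    # scaled by the same factor before they are passed to the mask predictor.
    print(resizer.get_preprocess_shape(image.shape[0], image.shape[1], 1024))
    print(resizer.apply_coords(np.array([[[500.0, 375.0]]]), image.shape[:2]))

..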
code:: ipython3 - - def show_mask(mask, ax): - color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6]) - h, w = mask.shape[-2:] - mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1) - ax.imshow(mask_image) - - - def show_points(coords, labels, ax, marker_size=375): - pos_points = coords[labels == 1] - neg_points = coords[labels == 0] - ax.scatter( - pos_points[:, 0], - pos_points[:, 1], - color="green", - marker="*", - s=marker_size, - edgecolor="white", - linewidth=1.25, - ) - ax.scatter( - neg_points[:, 0], - neg_points[:, 1], - color="red", - marker="*", - s=marker_size, - edgecolor="white", - linewidth=1.25, - ) - - - def show_box(box, ax): - x0, y0 = box[0], box[1] - w, h = box[2] - box[0], box[3] - box[1] - ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2)) - -Image encoding -~~~~~~~~~~~~~~ - - - -To start work with image, we should preprocess it and obtain image -embeddings using ``ov_encoder``. We will use the same image for all -experiments, so it is possible to generate image embedding once and then -reuse them. - -.. code:: ipython3 - - preprocessed_image = preprocess_image(image) - encoding_results = ov_encoder(preprocessed_image) - - image_embeddings = encoding_results[ov_encoder.output(0)] - -Now, we can try to provide different prompts for mask generation - -Example point input -~~~~~~~~~~~~~~~~~~~ - - - -In this example we select one point. The green star symbol show its -location on the image below. - -.. code:: ipython3 - - input_point = np.array([[500, 375]]) - input_label = np.array([1]) - - plt.figure(figsize=(10, 10)) - plt.imshow(image) - show_points(input_point, input_label, plt.gca()) - plt.axis("off") - plt.show() - - - -.. image:: segment-anything-with-output_files/segment-anything-with-output_28_0.png - - -Add a batch index, concatenate a padding point, and transform it to -input tensor coordinate system. - -.. code:: ipython3 - - coord = np.concatenate([input_point, np.array([[0.0, 0.0]])], axis=0)[None, :, :] - label = np.concatenate([input_label, np.array([-1])], axis=0)[None, :].astype(np.float32) - coord = resizer.apply_coords(coord, image.shape[:2]).astype(np.float32) - -Package the inputs to run in the mask predictor. - -.. code:: ipython3 - - inputs = { - "image_embeddings": image_embeddings, - "point_coords": coord, - "point_labels": label, - } - -Predict a mask and threshold it to get binary mask (0 - no object, 1 - -object). - -.. code:: ipython3 - - results = ov_predictor(inputs) - - masks = results[ov_predictor.output(0)] - masks = postprocess_masks(masks, image.shape[:-1]) - masks = masks > 0.0 - -.. code:: ipython3 - - plt.figure(figsize=(10, 10)) - plt.imshow(image) - show_mask(masks, plt.gca()) - show_points(input_point, input_label, plt.gca()) - plt.axis("off") - plt.show() - - - -.. image:: segment-anything-with-output_files/segment-anything-with-output_35_0.png - - -Example with multiple points -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -in this example, we provide additional point for cover larger object -area. - -.. code:: ipython3 - - input_point = np.array([[500, 375], [1125, 625], [575, 750], [1405, 575]]) - input_label = np.array([1, 1, 1, 1]) - -Now, prompt for model looks like represented on this image: - -.. code:: ipython3 - - plt.figure(figsize=(10, 10)) - plt.imshow(image) - show_points(input_point, input_label, plt.gca()) - plt.axis("off") - plt.show() - - - -.. 
image:: segment-anything-with-output_files/segment-anything-with-output_39_0.png - - -Transform the points as in the previous example. - -.. code:: ipython3 - - coord = np.concatenate([input_point, np.array([[0.0, 0.0]])], axis=0)[None, :, :] - label = np.concatenate([input_label, np.array([-1])], axis=0)[None, :].astype(np.float32) - - coord = resizer.apply_coords(coord, image.shape[:2]).astype(np.float32) - -Package inputs, then predict and threshold the mask. - -.. code:: ipython3 - - inputs = { - "image_embeddings": image_embeddings, - "point_coords": coord, - "point_labels": label, - } - - results = ov_predictor(inputs) - - masks = results[ov_predictor.output(0)] - masks = postprocess_masks(masks, image.shape[:-1]) - masks = masks > 0.0 - -.. code:: ipython3 - - plt.figure(figsize=(10, 10)) - plt.imshow(image) - show_mask(masks, plt.gca()) - show_points(input_point, input_label, plt.gca()) - plt.axis("off") - plt.show() - - - -.. image:: segment-anything-with-output_files/segment-anything-with-output_44_0.png - - -Great! Looks like now, predicted mask cover whole truck. - -Example box and point input with negative label -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -In this example we define input prompt using bounding box and point -inside it.The bounding box represented as set of points of its left -upper corner and right lower corner. Label 0 for point speak that this -point should be excluded from mask. - -.. code:: ipython3 - - input_box = np.array([425, 600, 700, 875]) - input_point = np.array([[575, 750]]) - input_label = np.array([0]) - -.. code:: ipython3 - - plt.figure(figsize=(10, 10)) - plt.imshow(image) - show_box(input_box, plt.gca()) - show_points(input_point, input_label, plt.gca()) - plt.axis("off") - plt.show() - - - -.. image:: segment-anything-with-output_files/segment-anything-with-output_48_0.png - - -Add a batch index, concatenate a box and point inputs, add the -appropriate labels for the box corners, and transform. There is no -padding point since the input includes a box input. - -.. code:: ipython3 - - box_coords = input_box.reshape(2, 2) - box_labels = np.array([2, 3]) - - coord = np.concatenate([input_point, box_coords], axis=0)[None, :, :] - label = np.concatenate([input_label, box_labels], axis=0)[None, :].astype(np.float32) - - coord = resizer.apply_coords(coord, image.shape[:2]).astype(np.float32) - -Package inputs, then predict and threshold the mask. - -.. code:: ipython3 - - inputs = { - "image_embeddings": image_embeddings, - "point_coords": coord, - "point_labels": label, - } - - results = ov_predictor(inputs) - - masks = results[ov_predictor.output(0)] - masks = postprocess_masks(masks, image.shape[:-1]) - masks = masks > 0.0 - -.. code:: ipython3 - - plt.figure(figsize=(10, 10)) - plt.imshow(image) - show_mask(masks[0], plt.gca()) - show_box(input_box, plt.gca()) - show_points(input_point, input_label, plt.gca()) - plt.axis("off") - plt.show() - - - -.. image:: segment-anything-with-output_files/segment-anything-with-output_53_0.png - - -Interactive segmentation ------------------------- - - - -Now, you can try SAM on own image. Upload image to input window and -click on desired point, model predict segment based on your image and -point. - -.. 
code:: ipython3 - - class Segmenter: - def __init__(self, ov_encoder, ov_predictor): - self.encoder = ov_encoder - self.predictor = ov_predictor - self._img_embeddings = None - - def set_image(self, img: np.ndarray): - if self._img_embeddings is not None: - del self._img_embeddings - preprocessed_image = preprocess_image(img) - encoding_results = self.encoder(preprocessed_image) - image_embeddings = encoding_results[ov_encoder.output(0)] - self._img_embeddings = image_embeddings - return img - - def get_mask(self, points, img): - coord = np.array(points) - coord = np.concatenate([coord, np.array([[0, 0]])], axis=0) - coord = coord[None, :, :] - label = np.concatenate([np.ones(len(points)), np.array([-1])], axis=0)[None, :].astype(np.float32) - coord = resizer.apply_coords(coord, img.shape[:2]).astype(np.float32) - if self._img_embeddings is None: - self.set_image(img) - inputs = { - "image_embeddings": self._img_embeddings, - "point_coords": coord, - "point_labels": label, - } - - results = self.predictor(inputs) - masks = results[ov_predictor.output(0)] - masks = postprocess_masks(masks, img.shape[:-1]) - - masks = masks > 0.0 - mask = masks[0] - mask = np.transpose(mask, (1, 2, 0)) - return mask - - - segmenter = Segmenter(ov_encoder, ov_predictor) - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/segment-anything/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(segmenter) - - try: - demo.launch() - except Exception: - demo.launch(share=True) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name="your server name", server_port="server port in int")` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ - -Run OpenVINO model in automatic mask generation mode ----------------------------------------------------- - - - -Since SAM can efficiently process prompts, masks for the entire image -can be generated by sampling a large number of prompts over an image. -``automatic_mask_generation`` function implements this capability. It -works by sampling single-point input prompts in a grid over the image, -from each of which SAM can predict multiple masks. Then, masks are -filtered for quality and deduplicated using non-maximal suppression. -Additional options allow for further improvement of mask quality and -quantity, such as running prediction on multiple crops of the image or -postprocessing masks to remove small disconnected regions and holes. - -.. code:: ipython3 - - from segment_anything.utils.amg import ( - MaskData, - generate_crop_boxes, - uncrop_boxes_xyxy, - uncrop_masks, - uncrop_points, - calculate_stability_score, - rle_to_mask, - batched_mask_to_box, - mask_to_rle_pytorch, - is_box_near_crop_edge, - batch_iterator, - remove_small_regions, - build_all_layer_point_grids, - box_xyxy_to_xywh, - area_from_rle, - ) - from torchvision.ops.boxes import batched_nms, box_area - from typing import Tuple, List, Dict, Any - -.. 
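Before diving into the implementation, it may help to look at the prompt grid itself. The snippet below is illustrative only and uses the ``build_all_layer_point_grids`` helper imported above; the returned points are normalized to the ``[0, 1]`` range and are later scaled to the size of each image crop:

.. code:: ipython3

    # A 32x32 grid of single-point prompts for the full image (no extra crop layers).
    example_grids = build_all_layer_point_grids(32, 0, 1)
    print(len(example_grids), example_grids[0].shape)  # 1 (1024, 2)
    print(example_grids[0][:3])  # first few normalized (x, y) prompt locations

..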
code:: ipython3 - - def process_batch( - image_embedding: np.ndarray, - points: np.ndarray, - im_size: Tuple[int, ...], - crop_box: List[int], - orig_size: Tuple[int, ...], - iou_thresh, - mask_threshold, - stability_score_offset, - stability_score_thresh, - ) -> MaskData: - orig_h, orig_w = orig_size - - # Run model on this batch - transformed_points = resizer.apply_coords(points, im_size) - in_points = transformed_points - in_labels = np.ones(in_points.shape[0], dtype=int) - - inputs = { - "image_embeddings": image_embedding, - "point_coords": in_points[:, None, :], - "point_labels": in_labels[:, None], - } - res = ov_predictor(inputs) - masks = postprocess_masks(res[ov_predictor.output(0)], orig_size) - masks = torch.from_numpy(masks) - iou_preds = torch.from_numpy(res[ov_predictor.output(1)]) - - # Serialize predictions and store in MaskData - data = MaskData( - masks=masks.flatten(0, 1), - iou_preds=iou_preds.flatten(0, 1), - points=torch.as_tensor(points.repeat(masks.shape[1], axis=0)), - ) - del masks - - # Filter by predicted IoU - if iou_thresh > 0.0: - keep_mask = data["iou_preds"] > iou_thresh - data.filter(keep_mask) - - # Calculate stability score - data["stability_score"] = calculate_stability_score(data["masks"], mask_threshold, stability_score_offset) - if stability_score_thresh > 0.0: - keep_mask = data["stability_score"] >= stability_score_thresh - data.filter(keep_mask) - - # Threshold masks and calculate boxes - data["masks"] = data["masks"] > mask_threshold - data["boxes"] = batched_mask_to_box(data["masks"]) - - # Filter boxes that touch crop boundaries - keep_mask = ~is_box_near_crop_edge(data["boxes"], crop_box, [0, 0, orig_w, orig_h]) - if not torch.all(keep_mask): - data.filter(keep_mask) - - # Compress to RLE - data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w) - data["rles"] = mask_to_rle_pytorch(data["masks"]) - del data["masks"] - - return data - -.. code:: ipython3 - - def process_crop( - image: np.ndarray, - point_grids, - crop_box: List[int], - crop_layer_idx: int, - orig_size: Tuple[int, ...], - box_nms_thresh: float = 0.7, - mask_threshold: float = 0.0, - points_per_batch: int = 64, - pred_iou_thresh: float = 0.88, - stability_score_thresh: float = 0.95, - stability_score_offset: float = 1.0, - ) -> MaskData: - # Crop the image and calculate embeddings - x0, y0, x1, y1 = crop_box - cropped_im = image[y0:y1, x0:x1, :] - cropped_im_size = cropped_im.shape[:2] - preprocessed_cropped_im = preprocess_image(cropped_im) - crop_embeddings = ov_encoder(preprocessed_cropped_im)[ov_encoder.output(0)] - - # Get points for this crop - points_scale = np.array(cropped_im_size)[None, ::-1] - points_for_image = point_grids[crop_layer_idx] * points_scale - - # Generate masks for this crop in batches - data = MaskData() - for (points,) in batch_iterator(points_per_batch, points_for_image): - batch_data = process_batch( - crop_embeddings, - points, - cropped_im_size, - crop_box, - orig_size, - pred_iou_thresh, - mask_threshold, - stability_score_offset, - stability_score_thresh, - ) - data.cat(batch_data) - del batch_data - - # Remove duplicates within this crop. 
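    # `batched_nms` groups boxes by its third argument; passing zeros places every mask
    # in a single group, so duplicates are suppressed purely by box overlap, keeping
    # the candidates with the highest predicted IoU.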
- keep_by_nms = batched_nms( - data["boxes"].float(), - data["iou_preds"], - torch.zeros(len(data["boxes"])), # categories - iou_threshold=box_nms_thresh, - ) - data.filter(keep_by_nms) - - # Return to the original image frame - data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box) - data["points"] = uncrop_points(data["points"], crop_box) - data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(data["rles"]))]) - - return data - -.. code:: ipython3 - - def generate_masks(image: np.ndarray, point_grids, crop_n_layers, crop_overlap_ratio, crop_nms_thresh) -> MaskData: - orig_size = image.shape[:2] - crop_boxes, layer_idxs = generate_crop_boxes(orig_size, crop_n_layers, crop_overlap_ratio) - - # Iterate over image crops - data = MaskData() - for crop_box, layer_idx in zip(crop_boxes, layer_idxs): - crop_data = process_crop(image, point_grids, crop_box, layer_idx, orig_size) - data.cat(crop_data) - - # Remove duplicate masks between crops - if len(crop_boxes) > 1: - # Prefer masks from smaller crops - scores = 1 / box_area(data["crop_boxes"]) - scores = scores.to(data["boxes"].device) - keep_by_nms = batched_nms( - data["boxes"].float(), - scores, - torch.zeros(len(data["boxes"])), # categories - iou_threshold=crop_nms_thresh, - ) - data.filter(keep_by_nms) - - data.to_numpy() - return data - -.. code:: ipython3 - - def postprocess_small_regions(mask_data: MaskData, min_area: int, nms_thresh: float) -> MaskData: - """ - Removes small disconnected regions and holes in masks, then reruns - box NMS to remove any new duplicates. - - Edits mask_data in place. - - Requires open-cv as a dependency. - """ - if len(mask_data["rles"]) == 0: - return mask_data - - # Filter small disconnected regions and holes - new_masks = [] - scores = [] - for rle in mask_data["rles"]: - mask = rle_to_mask(rle) - - mask, changed = remove_small_regions(mask, min_area, mode="holes") - unchanged = not changed - mask, changed = remove_small_regions(mask, min_area, mode="islands") - unchanged = unchanged and not changed - - new_masks.append(torch.as_tensor(mask).unsqueeze(0)) - # Give score=0 to changed masks and score=1 to unchanged masks - # so NMS will prefer ones that didn't need postprocessing - scores.append(float(unchanged)) - - # Recalculate boxes and remove any new duplicates - masks = torch.cat(new_masks, dim=0) - boxes = batched_mask_to_box(masks) - keep_by_nms = batched_nms( - boxes.float(), - torch.as_tensor(scores), - torch.zeros(len(boxes)), # categories - iou_threshold=nms_thresh, - ) - - # Only recalculate RLEs for masks that have changed - for i_mask in keep_by_nms: - if scores[i_mask] == 0.0: - mask_torch = masks[i_mask].unsqueeze(0) - mask_data["rles"][i_mask] = mask_to_rle_pytorch(mask_torch)[0] - # update res directly - mask_data["boxes"][i_mask] = boxes[i_mask] - mask_data.filter(keep_by_nms) - - return mask_data - -There are several tunable parameters in automatic mask generation that -control how densely points are sampled and what the thresholds are for -removing low quality or duplicate masks. Additionally, generation can be -automatically run on crops of the image to get improved performance on -smaller objects, and post-processing can remove stray pixels and holes - -.. 
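The masks collected above are stored as uncompressed run-length encodings (RLE) to keep memory usage low. The toy example below is purely illustrative (a 4x4 mask, not part of the pipeline) and shows the round trip through ``mask_to_rle_pytorch`` and ``rle_to_mask`` imported earlier:

.. code:: ipython3

    # A tiny 4x4 mask: encode it to an uncompressed RLE record and decode it back.
    toy_mask = torch.zeros((1, 4, 4), dtype=torch.bool)
    toy_mask[0, 1:3, 1:3] = True

    rle = mask_to_rle_pytorch(toy_mask)[0]
    print(rle["size"], rle["counts"])                       # [4, 4] and the run lengths
    print((rle_to_mask(rle) == toy_mask[0].numpy()).all())  # True - lossless round trip

..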
code:: ipython3 - - def automatic_mask_generation( - image: np.ndarray, - min_mask_region_area: int = 0, - points_per_side: int = 32, - crop_n_layers: int = 0, - crop_n_points_downscale_factor: int = 1, - crop_overlap_ratio: float = 512 / 1500, - box_nms_thresh: float = 0.7, - crop_nms_thresh: float = 0.7, - ) -> List[Dict[str, Any]]: - """ - Generates masks for the given image. - - Arguments: - image (np.ndarray): The image to generate masks for, in HWC uint8 format. - - Returns: - list(dict(str, any)): A list over records for masks. Each record is - a dict containing the following keys: - segmentation (dict(str, any) or np.ndarray): The mask. If - output_mode='binary_mask', is an array of shape HW. Otherwise, - is a dictionary containing the RLE. - bbox (list(float)): The box around the mask, in XYWH format. - area (int): The area in pixels of the mask. - predicted_iou (float): The model's own prediction of the mask's - quality. This is filtered by the pred_iou_thresh parameter. - point_coords (list(list(float))): The point coordinates input - to the model to generate this mask. - stability_score (float): A measure of the mask's quality. This - is filtered on using the stability_score_thresh parameter. - crop_box (list(float)): The crop of the image used to generate - the mask, given in XYWH format. - """ - point_grids = build_all_layer_point_grids( - points_per_side, - crop_n_layers, - crop_n_points_downscale_factor, - ) - mask_data = generate_masks(image, point_grids, crop_n_layers, crop_overlap_ratio, crop_nms_thresh) - - # Filter small disconnected regions and holes in masks - if min_mask_region_area > 0: - mask_data = postprocess_small_regions( - mask_data, - min_mask_region_area, - max(box_nms_thresh, crop_nms_thresh), - ) - - mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]] - - # Write mask records - curr_anns = [] - for idx in range(len(mask_data["segmentations"])): - ann = { - "segmentation": mask_data["segmentations"][idx], - "area": area_from_rle(mask_data["rles"][idx]), - "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(), - "predicted_iou": mask_data["iou_preds"][idx].item(), - "point_coords": [mask_data["points"][idx].tolist()], - "stability_score": mask_data["stability_score"][idx].item(), - "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(), - } - curr_anns.append(ann) - - return curr_anns - -.. code:: ipython3 - - prediction = automatic_mask_generation(image) - -``automatic_mask_generation`` returns a list over masks, where each mask -is a dictionary containing various data about the mask. These keys are: - -- ``segmentation`` : the mask -- ``area`` : the area of the mask in pixels -- ``bbox`` : the boundary box of the mask in XYWH format -- ``predicted_iou`` : the model’s own prediction for the quality of the - mask -- ``point_coords`` : the sampled input point that generated this mask -- ``stability_score`` : an additional measure of mask quality -- ``crop_box`` : the crop of the image used to generate this mask in - XYWH format - -.. code:: ipython3 - - print(f"Number of detected masks: {len(prediction)}") - print(f"Annotation keys: {prediction[0].keys()}") - - -.. parsed-literal:: - - Number of detected masks: 48 - Annotation keys: dict_keys(['segmentation', 'area', 'bbox', 'predicted_iou', 'point_coords', 'stability_score', 'crop_box']) - - -.. 
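The generation parameters mentioned earlier can be tuned through the arguments of ``automatic_mask_generation``. The call below is an illustrative sketch only (it is noticeably slower than the default call above) and uses just the arguments exposed by the helper defined in this notebook:

.. code:: ipython3

    # Denser prompt sampling plus crop-based generation for small objects; stray
    # regions and holes below 100 pixels are removed in post-processing.
    dense_prediction = automatic_mask_generation(
        image,
        points_per_side=64,
        crop_n_layers=1,
        min_mask_region_area=100,
    )
    print(f"Number of detected masks: {len(dense_prediction)}")

..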
code:: ipython3 - - from tqdm.notebook import tqdm - - - def draw_anns(image, anns): - if len(anns) == 0: - return - segments_image = image.copy() - sorted_anns = sorted(anns, key=(lambda x: x["area"]), reverse=True) - for ann in tqdm(sorted_anns): - mask = ann["segmentation"] - mask_color = np.random.randint(0, 255, size=(1, 1, 3)).astype(np.uint8) - segments_image[mask] = mask_color - return cv2.addWeighted(image.astype(np.float32), 0.7, segments_image.astype(np.float32), 0.3, 0.0) - -.. code:: ipython3 - - import PIL - - out = draw_anns(image, prediction) - cv2.imwrite("result.png", out[:, :, ::-1]) - - PIL.Image.open("result.png") - - - -.. parsed-literal:: - - 0%| | 0/48 [00:00`__ provides a suite of -advanced algorithms for Neural Networks inference optimization in -OpenVINO with minimal accuracy drop. - -Since encoder costing much more time than other parts in SAM inference -pipeline, we will use 8-bit quantization in post-training mode (without -the fine-tuning pipeline) to optimize encoder of SAM. - -The optimization process contains the following steps: - -1. Create a Dataset for quantization. -2. Run ``nncf.quantize`` for getting an optimized model. -3. Serialize OpenVINO IR model, using the ``openvino.save_model`` - function. - -Prepare a calibration dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Download COCO dataset. Since the dataset is used to calibrate the -model’s parameter instead of fine-tuning it, we don’t need to download -the label files. - -.. code:: ipython3 - - from zipfile import ZipFile - - DATA_URL = "https://ultralytics.com/assets/coco128.zip" - OUT_DIR = Path(".") - - if not (OUT_DIR / "coco128/images/train2017").exists(): - download_file(DATA_URL, directory=OUT_DIR, show_progress=True) - with ZipFile("coco128.zip", "r") as zip_ref: - zip_ref.extractall(OUT_DIR) - - -.. parsed-literal:: - - 'coco128.zip' already exists. - - -Create an instance of the ``nncf.Dataset`` class that represents the -calibration dataset. For PyTorch, we can pass an instance of the -``torch.utils.data.DataLoader`` object. - -.. code:: ipython3 - - import torch.utils.data as data - - - class COCOLoader(data.Dataset): - def __init__(self, images_path): - self.images = list(Path(images_path).iterdir()) - - def __getitem__(self, index): - image_path = self.images[index] - image = cv2.imread(str(image_path)) - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - return image - - def __len__(self): - return len(self.images) - - - coco_dataset = COCOLoader(OUT_DIR / "coco128/images/train2017") - calibration_loader = torch.utils.data.DataLoader(coco_dataset) - -The transformation function is a function that takes a sample from the -dataset and returns data that can be passed to the model for inference. - -.. code:: ipython3 - - import nncf - - - def transform_fn(image_data): - """ - Quantization transform function. Extracts and preprocess input data from dataloader item for quantization. - Parameters: - image_data: image data produced by DataLoader during iteration - Returns: - input_tensor: input data in Dict format for model quantization - """ - image = image_data.numpy() - processed_image = preprocess_image(np.squeeze(image)) - return processed_image - - - calibration_dataset = nncf.Dataset(calibration_loader, transform_fn) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. 
Supported frameworks detected: torch, tensorflow, onnx, openvino - - -Run quantization and serialize OpenVINO IR model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The ``nncf.quantize`` function provides an interface for model -quantization. It requires an instance of the OpenVINO Model and -quantization dataset. It is available for models in the following -frameworks: ``PyTorch``, ``TensorFlow 2.x``, ``ONNX``, and -``OpenVINO IR``. - -Optionally, some additional parameters for the configuration -quantization process (number of samples for quantization, preset, model -type, etc.) can be provided. ``model_type`` can be used to specify -quantization scheme required for specific type of the model. For -example, Transformer models such as SAM require a special quantization -scheme to preserve accuracy after quantization. To achieve a better -result, we will use a ``mixed`` quantization preset. It provides -symmetric quantization of weights and asymmetric quantization of -activations. - - **Note**: Model post-training quantization is time-consuming process. - Be patient, it can take several minutes depending on your hardware. - -.. code:: ipython3 - - model = core.read_model(ov_encoder_path) - quantized_model = nncf.quantize( - model, - calibration_dataset, - model_type=nncf.parameters.ModelType.TRANSFORMER, - subset_size=128, - ) - print("model quantization finished") - - -.. parsed-literal:: - - 2023-09-11 20:39:36.145499: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2023-09-11 20:39:36.181406: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2023-09-11 20:39:36.769588: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - Statistics collection: 100%|██████████████████| 128/128 [02:12<00:00, 1.03s/it] - Applying Smooth Quant: 100%|████████████████████| 48/48 [00:01<00:00, 32.29it/s] - - -.. parsed-literal:: - - INFO:nncf:36 ignored nodes was found by name in the NNCFGraph - - -.. parsed-literal:: - - Statistics collection: 100%|██████████████████| 128/128 [04:36<00:00, 2.16s/it] - Applying Fast Bias correction: 100%|████████████| 49/49 [00:28<00:00, 1.72it/s] - -.. parsed-literal:: - - model quantization finished - - - - - - - -.. code:: ipython3 - - ov_encoder_path_int8 = "sam_image_encoder_int8.xml" - ov.save_model(quantized_model, ov_encoder_path_int8) - -Validate Quantized Model Inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We can reuse the previous code to validate the output of ``INT8`` model. - -.. 
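Besides the visual check below, a simple numeric comparison can be useful. The helper in the next cell is an illustrative sketch and not part of the original pipeline; ``masks_fp32`` and ``masks_int8`` are hypothetical names for masks produced by the FP32 and INT8 encoders for the same prompt:

.. code:: ipython3

    import numpy as np


    def mask_iou(mask_a: np.ndarray, mask_b: np.ndarray) -> float:
        """IoU between two boolean masks of identical shape (illustrative helper)."""
        intersection = np.logical_and(mask_a, mask_b).sum()
        union = np.logical_or(mask_a, mask_b).sum()
        return float(intersection) / float(union) if union else 1.0


    # Usage idea: keep the FP32 result, e.g. `masks_fp32 = masks.copy()`, before
    # re-running prediction with the INT8 encoder, then compare:
    # mask_iou(masks_fp32[0, 0], masks_int8[0, 0])

..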
code:: ipython3 - - # Load INT8 model and run pipeline again - ov_encoder_model_int8 = core.read_model(ov_encoder_path_int8) - ov_encoder_int8 = core.compile_model(ov_encoder_model_int8, device.value) - encoding_results = ov_encoder_int8(preprocessed_image) - image_embeddings = encoding_results[ov_encoder_int8.output(0)] - - input_point = np.array([[500, 375]]) - input_label = np.array([1]) - coord = np.concatenate([input_point, np.array([[0.0, 0.0]])], axis=0)[None, :, :] - label = np.concatenate([input_label, np.array([-1])], axis=0)[None, :].astype(np.float32) - - coord = resizer.apply_coords(coord, image.shape[:2]).astype(np.float32) - inputs = { - "image_embeddings": image_embeddings, - "point_coords": coord, - "point_labels": label, - } - results = ov_predictor(inputs) - - masks = results[ov_predictor.output(0)] - masks = postprocess_masks(masks, image.shape[:-1]) - masks = masks > 0.0 - plt.figure(figsize=(10, 10)) - plt.imshow(image) - show_mask(masks, plt.gca()) - show_points(input_point, input_label, plt.gca()) - plt.axis("off") - plt.show() - - - -.. image:: segment-anything-with-output_files/segment-anything-with-output_81_0.png - - -Run ``INT8`` model in automatic mask generation mode - -.. code:: ipython3 - - ov_encoder = ov_encoder_int8 - prediction = automatic_mask_generation(image) - out = draw_anns(image, prediction) - cv2.imwrite("result_int8.png", out[:, :, ::-1]) - PIL.Image.open("result_int8.png") - - - -.. parsed-literal:: - - 0%| | 0/47 [00:00`__ -to measure the inference performance of the ``FP32`` and ``INT8`` -models. - -.. code:: ipython3 - - # Inference FP32 model (OpenVINO IR) - !benchmark_app -m $ov_encoder_path -d $device.value - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ WARNING ] Default duration 120 seconds is used for unknown device AUTO - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2023.1.0-12050-e33de350633 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2023.1.0-12050-e33de350633 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 31.21 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,1024,1024] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.neck.3/aten::add/Add_2933) : f32 / [...] / [1,256,64,64] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,1024,1024] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.neck.3/aten::add/Add_2933) : f32 / [...] 
/ [1,256,64,64] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 956.62 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model474 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] NETWORK_NAME: Model474 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: False - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 120000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 3347.39 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 132 iterations - [ INFO ] Duration: 135907.17 ms - [ INFO ] Latency: - [ INFO ] Median: 12159.63 ms - [ INFO ] Average: 12098.43 ms - [ INFO ] Min: 7652.77 ms - [ INFO ] Max: 13027.98 ms - [ INFO ] Throughput: 0.97 FPS - - -.. code:: ipython3 - - # Inference INT8 model (OpenVINO IR) - !benchmark_app -m $ov_encoder_path_int8 -d $device.value - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ WARNING ] Default duration 120 seconds is used for unknown device AUTO - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2023.1.0-12050-e33de350633 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2023.1.0-12050-e33de350633 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 40.67 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,1024,1024] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.neck.3/aten::add/Add_2933) : f32 / [...] / [1,256,64,64] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,1024,1024] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.neck.3/aten::add/Add_2933) : f32 / [...] 
/ [1,256,64,64] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 1151.47 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model474 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] NETWORK_NAME: Model474 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: False - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 120000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 1951.78 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 216 iterations - [ INFO ] Duration: 130123.96 ms - [ INFO ] Latency: - [ INFO ] Median: 7192.03 ms - [ INFO ] Average: 7197.18 ms - [ INFO ] Min: 6134.35 ms - [ INFO ] Max: 7888.28 ms - [ INFO ] Throughput: 1.66 FPS - diff --git a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_21_0.png b/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_21_0.png deleted file mode 100644 index 0f4fae19177398..00000000000000 --- a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_21_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:346047e07dbc27868c4617ea1a9a0b57e6788980308c2cf9785112639b6202f9 -size 467418 diff --git a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_28_0.png b/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_28_0.png deleted file mode 100644 index 777175ba957427..00000000000000 --- a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_28_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:706d62427504453c1dabd26b6f91dcf1bc4354ff8c357972aee57499258d2945 -size 468529 diff --git a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_35_0.png b/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_35_0.png deleted file mode 100644 index 88d01d5d96a7d3..00000000000000 --- a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_35_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eef2326ff43ff103751c1e4e985eb1d00584bc43deaad7bae0becb27840731cf -size 469443 diff --git 
a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_39_0.png b/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_39_0.png deleted file mode 100644 index 8def1b9bc0117e..00000000000000 --- a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_39_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7f5fb120f01a644da1f7c8d5ff12850bac6da2dc208a95741230ad3c251598e0 -size 470668 diff --git a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_44_0.png b/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_44_0.png deleted file mode 100644 index 849e7f5b4aa04f..00000000000000 --- a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_44_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f237227062ab2ccd8ef03d3b0b3f841c6e68ad5c55737b4ae4852bb73a30f22d -size 468092 diff --git a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_48_0.png b/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_48_0.png deleted file mode 100644 index aa6e6e273bbd19..00000000000000 --- a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_48_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:26031e0dc1c7895368afbed82a19bbcf62f0e6c9cbc5cc6add149f7e2ddca211 -size 468088 diff --git a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_53_0.png b/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_53_0.png deleted file mode 100644 index de86300f41102c..00000000000000 --- a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_53_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:687b590a1853070d43145ac92db47f2cf93bfb245b970e2aaf4fd2dfef4f0f5c -size 472756 diff --git a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_69_1.jpg b/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_69_1.jpg deleted file mode 100644 index 837f6ddf6b741b..00000000000000 --- a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_69_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:05935769e17190de09a2b0f985e1fd2687e23d26c96c06ec603347f17e38f4b1 -size 262203 diff --git a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_69_1.png b/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_69_1.png deleted file mode 100644 index e21b54d61e7d2c..00000000000000 --- a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_69_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd940fe035baf9e0bf49a12c16132dd37fb75a8ab69a8c7dad069ed31265699 -size 2409333 diff --git a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_81_0.png b/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_81_0.png deleted file mode 100644 index 09e40842442380..00000000000000 --- a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_81_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5efe373fadc700dc7e9651c52bd657ac6075b75ee657449e3f3e8960b31487bd 
-size 469432 diff --git a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_83_1.jpg b/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_83_1.jpg deleted file mode 100644 index 0b5ca693a9ded4..00000000000000 --- a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_83_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4ba1f085c14700f85188daf0d68c2cc23a9b42066f3fc3179dd8991c96df5497 -size 262535 diff --git a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_83_1.png b/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_83_1.png deleted file mode 100644 index 9ed47e66f764a4..00000000000000 --- a/docs/notebooks/segment-anything-with-output_files/segment-anything-with-output_83_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8d8ff846bbc449eb1e1cdb8b7e33b71365c8d514ca55f4221871dfbe45ec5444 -size 2397126 diff --git a/docs/notebooks/siglip-zero-shot-image-classification-with-output.rst b/docs/notebooks/siglip-zero-shot-image-classification-with-output.rst deleted file mode 100644 index 900b3abd24613d..00000000000000 --- a/docs/notebooks/siglip-zero-shot-image-classification-with-output.rst +++ /dev/null @@ -1,635 +0,0 @@ -Zero-shot Image Classification with SigLIP -========================================== - -|Colab| - -Zero-shot image classification is a computer vision task to classify -images into one of several classes without any prior training or -knowledge of the classes. - -.. figure:: https://user-images.githubusercontent.com/29454499/207773481-d77cacf8-6cdc-4765-a31b-a1669476d620.png - :alt: zero-shot-pipeline - - zero-shot-pipeline - -`\**image -source\* `__ - -Zero-shot learning resolves several challenges in image retrieval -systems. For example, with the rapid growth of categories on the web, it -is challenging to index images based on unseen categories. We can -associate unseen categories to images with zero-shot learning by -exploiting attributes to model’s relationship between visual features -and labels. In this tutorial, we will use the -`SigLIP `__ -model to perform zero-shot image classification. - - -**Table of contents:** - - -- `Instantiate model <#instantiate-model>`__ -- `Run PyTorch model inference <#run-pytorch-model-inference>`__ -- `Convert model to OpenVINO Intermediate Representation (IR) - format <#convert-model-to-openvino-intermediate-representation-ir-format>`__ -- `Run OpenVINO model <#run-openvino-model>`__ -- `Apply post-training quantization using - NNCF <#apply-post-training-quantization-using-nncf>`__ - - - `Prepare dataset <#prepare-dataset>`__ - - `Quantize model <#quantize-model>`__ - - `Run quantized OpenVINO model <#run-quantized-openvino-model>`__ - - `Compare File Size <#compare-file-size>`__ - - `Compare inference time of the FP16 IR and quantized - models <#compare-inference-time-of-the-fp16-ir-and-quantized-models>`__ - -- `Interactive inference <#interactive-inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. 
|Colab| image:: https://colab.research.google.com/assets/colab-badge.svg - :target: https://colab.research.google.com/github/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/siglip-zero-shot-image-classification/siglip-zero-shot-image-classification.ipynb - -Instantiate model ------------------ - - - -The SigLIP model was proposed in `Sigmoid Loss for Language Image -Pre-Training `__. SigLIP proposes to -replace the loss function used in -`CLIP `__ (Contrastive Language–Image -Pre-training) by a simple pairwise sigmoid loss. This results in better -performance in terms of zero-shot classification accuracy on ImageNet. - -The abstract from the paper is the following: - - We propose a simple pairwise Sigmoid loss for Language-Image - Pre-training (SigLIP). Unlike standard contrastive learning with - softmax normalization, the sigmoid loss operates solely on image-text - pairs and does not require a global view of the pairwise similarities - for normalization. The sigmoid loss simultaneously allows further - scaling up the batch size, while also performing better at smaller - batch sizes. - -You can find more information about this model in the `research -paper `__, `GitHub -repository `__, `Hugging -Face model -page `__. - -In this notebook, we will use -`google/siglip-base-patch16-224 `__, -available via Hugging Face Transformers, but the same steps are -applicable for other CLIP family models. - -First, we need to create ``AutoModel`` class object and initialize it -with model configuration and weights, using ``from_pretrained`` method. -The model will be automatically downloaded from Hugging Face Hub and -cached for the next usage. ``AutoProcessor`` class is a wrapper for -input data preprocessing. It includes both encoding the text using -tokenizer and preparing the images. - -.. code:: ipython3 - - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "gradio>=4.19" "protobuf>=3.20.3" "openvino>=2024.4.0" "transformers>=4.37" "torch>=2.1" Pillow sentencepiece protobuf scipy datasets "nncf>=2.13.0" - %pip install -q "matplotlib>=3.4" - -.. code:: ipython3 - - from transformers import AutoProcessor, AutoModel - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("siglip-zero-shot-image-classification.ipynb") - - model = AutoModel.from_pretrained("google/siglip-base-patch16-224") - processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224") - -Run PyTorch model inference ---------------------------- - - - -To perform classification, define labels and load an image in RGB -format. To give the model wider text context and improve guidance, we -extend the labels description using the template “This is a photo of a”. -Both the list of label descriptions and image should be passed through -the processor to obtain a dictionary with input data in the -model-specific format. The model predicts an image-text similarity score -in raw logits format, which can be normalized to the ``[0, 1]`` range -using the ``softmax`` function. Then, we select labels with the highest -similarity score for the final result. - -.. 
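As a minimal illustration of that post-processing step (using made-up logits rather than real model output), the conversion from raw scores to ranked labels looks like this:

.. code:: ipython3

    import numpy as np

    # Hypothetical raw image-text similarity scores for four candidate labels.
    logits = np.array([2.1, 0.3, -1.0, 4.2])
    labels = ["cat", "dog", "tree", "car"]

    # Softmax maps the logits to probabilities in the [0, 1] range.
    probs = np.exp(logits - logits.max())
    probs = probs / probs.sum()

    # Keep the labels with the highest similarity scores.
    top = np.argsort(-probs)[:3]
    print([(labels[i], round(float(probs[i]), 3)) for i in top])

The next cell defines a helper that visualizes exactly this kind of result for the real model.

..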
code:: ipython3 - - # Results visualization function - from typing import List - import matplotlib.pyplot as plt - import numpy as np - from PIL import Image - - - def visualize_result(image: Image, labels: List[str], probs: np.ndarray, top: int = 5): - """ - Utility function for visualization classification results - params: - image: input image - labels: list of classification labels - probs: model predicted softmaxed probabilities for each label - top: number of the highest probability results for visualization - returns: - None - """ - plt.figure(figsize=(72, 64)) - top_labels = np.argsort(-probs)[: min(top, probs.shape[0])] - top_probs = probs[top_labels] - plt.subplot(8, 8, 1) - plt.imshow(image) - plt.axis("off") - - plt.subplot(8, 8, 2) - y = np.arange(top_probs.shape[-1]) - plt.grid() - plt.barh(y, top_probs) - plt.gca().invert_yaxis() - plt.gca().set_axisbelow(True) - plt.yticks(y, [labels[index] for index in top_labels]) - plt.xlabel("probability") - - print([{labels[x]: round(y, 2)} for x, y in zip(top_labels, top_probs)]) - -.. code:: ipython3 - - import requests - from pathlib import Path - import torch - from PIL import Image - - image_path = Path("test_image.jpg") - if not image_path.exists(): - r = requests.get( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco.jpg", - ) - - with image_path.open("wb") as f: - f.write(r.content) - image = Image.open(image_path) - - input_labels = [ - "cat", - "dog", - "wolf", - "tiger", - "man", - "horse", - "frog", - "tree", - "house", - "computer", - ] - text_descriptions = [f"This is a photo of a {label}" for label in input_labels] - - inputs = processor(text=text_descriptions, images=[image], padding="max_length", return_tensors="pt") - - with torch.no_grad(): - model.config.torchscript = False - results = model(**inputs) - - logits_per_image = results["logits_per_image"] # this is the image-text similarity score - - probs = logits_per_image.softmax(dim=1).detach().numpy() - visualize_result(image, input_labels, probs[0]) - - -.. parsed-literal:: - - [{'dog': 0.99}, {'cat': 0.0}, {'horse': 0.0}, {'wolf': 0.0}, {'tiger': 0.0}] - - - -.. image:: siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_6_1.png - - -Convert model to OpenVINO Intermediate Representation (IR) format ------------------------------------------------------------------ - - - -For best results with OpenVINO, it is recommended to convert the model -to OpenVINO IR format. OpenVINO supports PyTorch via Model conversion -API. To convert the PyTorch model to OpenVINO IR format we will use -``ov.convert_model`` of `model conversion -API `__. -The ``ov.convert_model`` Python function returns an OpenVINO Model -object ready to load on the device and start making predictions. - -.. code:: ipython3 - - import openvino as ov - - model.config.torchscript = True - ov_model = ov.convert_model(model, example_input=dict(inputs)) - -Run OpenVINO model ------------------- - - - -The steps for making predictions with the OpenVINO SigLIP model are -similar to the PyTorch model. Let us check the model result using the -same input data from the example above with PyTorch. - -Select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - -Run OpenVINO model - -.. 
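If the dropdown widget is not convenient (for example, when running outside Jupyter), the available devices can also be listed directly through the OpenVINO runtime. This is a small sketch; the reported device names depend on your machine:

.. code:: ipython3

    import openvino as ov

    core = ov.Core()
    # Typical entries are "CPU", "GPU" and "NPU"; "AUTO" lets OpenVINO pick a device.
    print(core.available_devices)

The next cell compiles the converted model on the selected device and repeats the example inference.

..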
code:: ipython3 - - from scipy.special import softmax - - - core = ov.Core() - # compile model for loading on device - compiled_ov_model = core.compile_model(ov_model, device.value) - # obtain output tensor for getting predictions - logits_per_image_out = compiled_ov_model.output(0) - # run inference on preprocessed data and get image-text similarity score - ov_logits_per_image = compiled_ov_model(dict(inputs))[logits_per_image_out] - # perform softmax on score - probs = softmax(ov_logits_per_image[0]) - # visualize prediction - visualize_result(image, input_labels, probs) - - -.. parsed-literal:: - - [{'dog': 0.99}, {'cat': 0.0}, {'horse': 0.0}, {'wolf': 0.0}, {'tiger': 0.0}] - - - -.. image:: siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_13_1.png - - -Great! Looks like we got the same result. - -Apply post-training quantization using NNCF -------------------------------------------- - - - -`NNCF `__ enables -post-training quantization by adding the quantization layers into the -model graph and then using a subset of the training dataset to -initialize the parameters of these additional quantization layers. The -framework is designed so that modifications to your original training -code are minor. Quantization is the simplest scenario and requires a few -modifications. - -The optimization process contains the following steps: - -1. Create a dataset for quantization. -2. Run ``nncf.quantize`` for getting a quantized model. - -Prepare dataset -~~~~~~~~~~~~~~~ - - - -The `Conceptual -Captions `__ dataset -consisting of ~3.3M images annotated with captions is used to quantize -model. - -.. code:: ipython3 - - import requests - from io import BytesIO - from PIL import Image - from requests.packages.urllib3.exceptions import InsecureRequestWarning - - requests.packages.urllib3.disable_warnings(InsecureRequestWarning) - - - def check_text_data(data): - """ - Check if the given data is text-based. - """ - if isinstance(data, str): - return True - if isinstance(data, list): - return all(isinstance(x, str) for x in data) - return False - - - def get_pil_from_url(url): - """ - Downloads and converts an image from a URL to a PIL Image object. - """ - response = requests.get(url, verify=False, timeout=20) - image = Image.open(BytesIO(response.content)) - return image.convert("RGB") - - - def collate_fn(example, image_column="image_url", text_column="caption"): - """ - Preprocesses an example by loading and transforming image and text data. - Checks if the text data in the example is valid by calling the `check_text_data` function. - Downloads the image specified by the URL in the image_column by calling the `get_pil_from_url` function. - If there is any error during the download process, returns None. - Returns the preprocessed inputs with transformed image and text data. - """ - assert len(example) == 1 - example = example[0] - - if not check_text_data(example[text_column]): - raise ValueError("Text data is not valid") - - url = example[image_column] - try: - image = get_pil_from_url(url) - h, w = image.size - if h == 1 or w == 1: - return None - except Exception: - return None - - inputs = processor( - text=example[text_column], - images=[image], - return_tensors="pt", - padding="max_length", - ) - if inputs["input_ids"].shape[1] > model.config.text_config.max_position_embeddings: - return None - return inputs - -.. 
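To make the expected sample format concrete, here is a small sketch that builds one calibration-style sample from a locally created dummy image instead of a downloaded one (illustration only, it is not used later):

.. code:: ipython3

    from PIL import Image

    # A blank RGB image stands in for a downloaded Conceptual Captions image.
    dummy_image = Image.new("RGB", (224, 224), color="white")
    sample = processor(
        text="a plain white square",
        images=[dummy_image],
        return_tensors="pt",
        padding="max_length",
    )
    print(sample["pixel_values"].shape, sample["input_ids"].shape)

The next cells stream the real dataset and collect such samples for calibration.

..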
code:: ipython3 - - import torch - from datasets import load_dataset - from tqdm.notebook import tqdm - - - def prepare_calibration_data(dataloader, init_steps): - """ - This function prepares calibration data from a dataloader for a specified number of initialization steps. - It iterates over the dataloader, fetching batches and storing the relevant data. - """ - data = [] - print(f"Fetching {init_steps} for the initialization...") - counter = 0 - for batch in tqdm(dataloader): - if counter == init_steps: - break - if batch: - counter += 1 - with torch.no_grad(): - data.append( - { - "pixel_values": batch["pixel_values"].to("cpu"), - "input_ids": batch["input_ids"].to("cpu"), - } - ) - return data - - - def prepare_dataset(opt_init_steps=300, max_train_samples=1000): - """ - Prepares a vision-text dataset for quantization. - """ - dataset = load_dataset("google-research-datasets/conceptual_captions", streaming=True, trust_remote_code=True) - train_dataset = dataset["train"].shuffle(seed=42, buffer_size=max_train_samples) - dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1) - calibration_data = prepare_calibration_data(dataloader, opt_init_steps) - return calibration_data - -.. code:: ipython3 - - calibration_data = prepare_dataset() - -Quantize model -~~~~~~~~~~~~~~ - - - -Create a quantized model from the pre-trained ``FP16`` model. - - **NOTE**: Quantization is time and memory consuming operation. - Running quantization code below may take a long time. - -.. code:: ipython3 - - import nncf - import logging - - nncf.set_log_level(logging.ERROR) - - if len(calibration_data) == 0: - raise RuntimeError("Calibration dataset is empty. Please check internet connection and try to download images manually.") - - calibration_dataset = nncf.Dataset(calibration_data) - quantized_ov_model = nncf.quantize( - model=ov_model, - calibration_dataset=calibration_dataset, - model_type=nncf.ModelType.TRANSFORMER, - ) - -NNCF also supports quantization-aware training, and other algorithms -than quantization. See the `NNCF -documentation `__ -in the NNCF repository for more information. - -Run quantized OpenVINO model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The steps for making predictions with the quantized OpenVINO SigLIP -model are similar to the PyTorch model. - -.. code:: ipython3 - - from scipy.special import softmax - - - input_labels = [ - "cat", - "dog", - "wolf", - "tiger", - "man", - "horse", - "frog", - "tree", - "house", - "computer", - ] - text_descriptions = [f"This is a photo of a {label}" for label in input_labels] - - inputs = processor(text=text_descriptions, images=[image], return_tensors="pt", padding="max_length") - compiled_int8_ov_model = ov.compile_model(quantized_ov_model, device.value) - - logits_per_image_out = compiled_int8_ov_model.output(0) - ov_logits_per_image = compiled_int8_ov_model(dict(inputs))[logits_per_image_out] - probs = softmax(ov_logits_per_image, axis=1) - visualize_result(image, input_labels, probs[0]) - - -.. parsed-literal:: - - [{'dog': 1.0}, {'horse': 0.0}, {'cat': 0.0}, {'wolf': 0.0}, {'frog': 0.0}] - - - -.. image:: siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_24_1.png - - -Compare File Size -~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - from pathlib import Path - - fp16_model_path = "siglip-base-patch16-224.xml" - ov.save_model(ov_model, fp16_model_path) - - int8_model_path = "siglip-base-patch16-224_int8.xml" - ov.save_model(quantized_ov_model, int8_model_path) - - fp16_ir_model_size = Path(fp16_model_path).with_suffix(".bin").stat().st_size / 1024 / 1024 - quantized_model_size = Path(int8_model_path).with_suffix(".bin").stat().st_size / 1024 / 1024 - print(f"FP16 IR model size: {fp16_ir_model_size:.2f} MB") - print(f"INT8 model size: {quantized_model_size:.2f} MB") - print(f"Model compression rate: {fp16_ir_model_size / quantized_model_size:.3f}") - - -.. parsed-literal:: - - FP16 IR model size: 387.49 MB - INT8 model size: 196.46 MB - Model compression rate: 1.972 - - -Compare inference time of the FP16 IR and quantized models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To measure the inference performance of the ``FP16`` and ``INT8`` -models, we use median inference time on calibration dataset. So we can -approximately estimate the speed up of the dynamic quantized models. - - **NOTE**: For the most accurate performance estimation, it is - recommended to run ``benchmark_app`` in a terminal/command prompt - after closing other applications with static shapes. - -.. code:: ipython3 - - import time - - - def calculate_inference_time(model_path, calibration_data): - model = ov.compile_model(model_path, device.value) - output_layer = model.output(0) - inference_time = [] - for batch in calibration_data: - start = time.perf_counter() - _ = model(batch)[output_layer] - end = time.perf_counter() - delta = end - start - inference_time.append(delta) - return np.median(inference_time) - -.. code:: ipython3 - - fp16_latency = calculate_inference_time(fp16_model_path, calibration_data) - int8_latency = calculate_inference_time(int8_model_path, calibration_data) - print(f"Performance speed up: {fp16_latency / int8_latency:.3f}") - - -.. parsed-literal:: - - Performance speed up: 2.827 - - -Interactive inference ---------------------- - - - -Now, it is your turn! You can provide your own image and comma-separated -list of labels for zero-shot classification. Feel free to upload an -image, using the file upload window and type label names into the text -field, using comma as the separator (for example, ``cat,dog,bird``) - -.. code:: ipython3 - - def classify(image, text): - """Classify image using classes listing. - Args: - image (np.ndarray): image that needs to be classified in CHW format. - text (str): comma-separated list of class labels - Returns: - (dict): Mapping between class labels and class probabilities. - """ - labels = text.split(",") - text_descriptions = [f"This is a photo of a {label}" for label in labels] - inputs = processor( - text=text_descriptions, - images=[image], - return_tensors="np", - padding="max_length", - ) - ov_logits_per_image = compiled_int8_ov_model(dict(inputs))[logits_per_image_out] - probs = softmax(ov_logits_per_image[0]) - - return {label: float(prob) for label, prob in zip(labels, probs)} - -.. 
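The helper below turns that comma-separated string into per-label prompts. The parsing step on its own (with a hypothetical input) is simply:

.. code:: ipython3

    text = "cat,dog,bird"
    labels = [label.strip() for label in text.split(",")]
    text_descriptions = [f"This is a photo of a {label}" for label in labels]
    print(text_descriptions)

..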
code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/siglip-zero-shot-image-classification/gradio_helper.py" - ) - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(classify) - - try: - demo.launch(debug=True, height=1000) - except Exception: - demo.launch(share=True, debug=True, height=1000) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_13_1.png b/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_13_1.png deleted file mode 100644 index 29d0419274c0d1..00000000000000 --- a/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_13_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5819cd4be115689362223af7969e6741829b6ccbbcba3e01cb1e12656e614df5 -size 581043 diff --git a/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_24_1.png b/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_24_1.png deleted file mode 100644 index ab972a707d732c..00000000000000 --- a/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_24_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b03ef5c495f44beeb8ca13d3433a8e10426aae5e3060890c9ea7c1a495fe5db9 -size 580993 diff --git a/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_6_1.png b/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_6_1.png deleted file mode 100644 index 29d0419274c0d1..00000000000000 --- a/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_6_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5819cd4be115689362223af7969e6741829b6ccbbcba3e01cb1e12656e614df5 -size 581043 diff --git a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output.rst b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output.rst deleted file mode 100644 index 3619a7603d41c3..00000000000000 --- a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output.rst +++ /dev/null @@ -1,614 +0,0 @@ -One Step Sketch to Image translation with pix2pix-turbo and OpenVINO -==================================================================== - -Diffusion models achieve remarkable results in image generation. They -are able synthesize high-quality images guided by user instructions. In -the same time, majority of diffusion-based image generation approaches -are time-consuming due to the iterative denoising process.Pix2Pix-turbo -model was proposed in `One-Step Image Translation with Text-to-Image -Models paper `__ for addressing -slowness of diffusion process in image-to-image translation task. 
It is -based on `SD-Turbo `__, a -fast generative text-to-image model that can synthesize photorealistic -images from a text prompt in a single network evaluation. Using only -single inference, pix2pix-turbo achieves comparable by quality results -with recent works such as ControlNet for Sketch2Photo and Edge2Image for -50 steps. - -.. image:: https://github.com/GaParmar/img2img-turbo/raw/main/assets/gen_variations.jpg - -In this tutorial you will learn how to turn sketches to images using -`Pix2Pix-Turbo `__ and -OpenVINO. - -**Table of contents:** - -- `Prerequisites <#prerequisites>`__ -- `Load PyTorch model <#load-pytorch-model>`__ -- `Convert PyTorch model to Openvino Intermediate Representation - format <#convert-pytorch-model-to-openvino-intermediate-representation-format>`__ -- `Select inference device <#select-inference-device>`__ -- `Compile model <#compile-model>`__ -- `Run model inference <#run-model-inference>`__ -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Clone `model repository `__ -and install required packages. - -.. code:: ipython3 - - %pip install -qU "openvino>=2024.1.0" "torch>=2.1" torchvision "diffusers==0.25.1" "huggingface-hub<0.26.0" "peft>=0.6.2" transformers tqdm pillow opencv-python "gradio==3.43.1" --extra-index-url https://download.pytorch.org/whl/cpu - -.. code:: ipython3 - - import requests - from pathlib import Path - - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - -.. code:: ipython3 - - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("sketch-to-image-pix2pix-turbo.ipynb") - - from cmd_helper import clone_repo - - - repo_dir = clone_repo("https://github.com/GaParmar/img2img-turbo.git") - - pix2pix_turbo_py_path = repo_dir / "src/pix2pix_turbo.py" - model_py_path = repo_dir / "src/model.py" - orig_pix2pix_turbo_path = pix2pix_turbo_py_path.parent / ("orig_" + pix2pix_turbo_py_path.name) - orig_model_py_path = model_py_path.parent / ("orig_" + model_py_path.name) - - if not orig_pix2pix_turbo_path.exists(): - pix2pix_turbo_py_path.rename(orig_pix2pix_turbo_path) - - with orig_pix2pix_turbo_path.open("r") as f: - data = f.read() - data = data.replace("cuda", "cpu") - with pix2pix_turbo_py_path.open("w") as out_f: - out_f.write(data) - - if not orig_model_py_path.exists(): - model_py_path.rename(orig_model_py_path) - - with orig_model_py_path.open("r") as f: - data = f.read() - data = data.replace("cuda", "cpu") - with model_py_path.open("w") as out_f: - out_f.write(data) - %cd $repo_dir - -Load PyTorch model ------------------- - - - -Pix2Pix-turbo architecture illustrated on the diagram below. 
Model -combines three separate modules in the original latent diffusion models -into a single end-to-end network with small trainable weights. This -architecture allows translation the input image x to the output y, while -retaining the input scene structure. Authors use LoRA adapters in each -module, introduce skip connections and Zero-Convolutions between input -and output, and retrain the first layer of the U-Net. Blue boxes on -diagram indicate trainable layers. Semi-transparent layers are frozen. -|model_diagram| - -.. |model_diagram| image:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/18f1a442-8547-4edd-85b0-d8bd1a99bdf1 - -.. code:: ipython3 - - import copy - from tqdm import tqdm - import torch - from transformers import AutoTokenizer, CLIPTextModel - from diffusers import AutoencoderKL, UNet2DConditionModel - from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution - from diffusers.utils.peft_utils import set_weights_and_activate_adapters - from peft import LoraConfig - import types - - from src.model import make_1step_sched - from src.pix2pix_turbo import TwinConv - - tokenizer = AutoTokenizer.from_pretrained("stabilityai/sd-turbo", subfolder="tokenizer") - - - def tokenize_prompt(prompt): - caption_tokens = tokenizer(prompt, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt").input_ids - return caption_tokens - - - def _vae_encoder_fwd(self, sample): - sample = self.conv_in(sample) - l_blocks = [] - # down - for down_block in self.down_blocks: - l_blocks.append(sample) - sample = down_block(sample) - # middle - sample = self.mid_block(sample) - sample = self.conv_norm_out(sample) - sample = self.conv_act(sample) - sample = self.conv_out(sample) - current_down_blocks = l_blocks - return sample, current_down_blocks - - - def _vae_decoder_fwd(self, sample, incoming_skip_acts, latent_embeds=None): - sample = self.conv_in(sample) - upscale_dtype = next(iter(self.up_blocks.parameters())).dtype - # middle - sample = self.mid_block(sample, latent_embeds) - sample = sample.to(upscale_dtype) - if not self.ignore_skip: - skip_convs = [self.skip_conv_1, self.skip_conv_2, self.skip_conv_3, self.skip_conv_4] - # up - for idx, up_block in enumerate(self.up_blocks): - skip_in = skip_convs[idx](incoming_skip_acts[::-1][idx] * self.gamma) - # add skip - sample = sample + skip_in - sample = up_block(sample, latent_embeds) - else: - for idx, up_block in enumerate(self.up_blocks): - sample = up_block(sample, latent_embeds) - # post-process - if latent_embeds is None: - sample = self.conv_norm_out(sample) - else: - sample = self.conv_norm_out(sample, latent_embeds) - sample = self.conv_act(sample) - sample = self.conv_out(sample) - return sample - - - def vae_encode(self, x: torch.FloatTensor): - """ - Encode a batch of images into latents. - - Args: - x (`torch.FloatTensor`): Input batch of images. - - Returns: - The latent representations of the encoded images. If `return_dict` is True, a - [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned. 
- """ - h, down_blocks = self.encoder(x) - - moments = self.quant_conv(h) - posterior = DiagonalGaussianDistribution(moments) - - return (posterior, down_blocks) - - - def vae_decode(self, z: torch.FloatTensor, skip_acts): - decoded = self._decode(z, skip_acts)[0] - return (decoded,) - - - def vae__decode(self, z: torch.FloatTensor, skip_acts): - z = self.post_quant_conv(z) - dec = self.decoder(z, skip_acts) - - return (dec,) - - - class Pix2PixTurbo(torch.nn.Module): - def __init__(self, pretrained_name=None, pretrained_path=None, ckpt_folder="checkpoints", lora_rank_unet=8, lora_rank_vae=4): - super().__init__() - self.text_encoder = CLIPTextModel.from_pretrained("stabilityai/sd-turbo", subfolder="text_encoder", variant="fp16").cpu() - self.sched = make_1step_sched() - - vae = AutoencoderKL.from_pretrained("stabilityai/sd-turbo", subfolder="vae", variant="fp16") - vae.encoder.forward = types.MethodType(_vae_encoder_fwd, vae.encoder) - vae.decoder.forward = types.MethodType(_vae_decoder_fwd, vae.decoder) - vae.encode = types.MethodType(vae_encode, vae) - vae.decode = types.MethodType(vae_decode, vae) - vae._decode = types.MethodType(vae__decode, vae) - # add the skip connection convs - vae.decoder.skip_conv_1 = torch.nn.Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1), bias=False).cpu() - vae.decoder.skip_conv_2 = torch.nn.Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False).cpu() - vae.decoder.skip_conv_3 = torch.nn.Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False).cpu() - vae.decoder.skip_conv_4 = torch.nn.Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False).cpu() - vae.decoder.ignore_skip = False - unet = UNet2DConditionModel.from_pretrained("stabilityai/sd-turbo", subfolder="unet", variant="fp16") - ckpt_folder = Path(ckpt_folder) - - if pretrained_name == "edge_to_image": - url = "https://www.cs.cmu.edu/~img2img-turbo/models/edge_to_image_loras.pkl" - ckpt_folder.mkdir(exist_ok=True) - outf = ckpt_folder / "edge_to_image_loras.pkl" - if not outf: - print(f"Downloading checkpoint to {outf}") - response = requests.get(url, stream=True) - total_size_in_bytes = int(response.headers.get("content-length", 0)) - block_size = 1024 # 1 Kibibyte - progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) - with open(outf, "wb") as file: - for data in response.iter_content(block_size): - progress_bar.update(len(data)) - file.write(data) - progress_bar.close() - if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: - print("ERROR, something went wrong") - print(f"Downloaded successfully to {outf}") - p_ckpt = outf - sd = torch.load(p_ckpt, map_location="cpu") - unet_lora_config = LoraConfig(r=sd["rank_unet"], init_lora_weights="gaussian", target_modules=sd["unet_lora_target_modules"]) - vae_lora_config = LoraConfig(r=sd["rank_vae"], init_lora_weights="gaussian", target_modules=sd["vae_lora_target_modules"]) - vae.add_adapter(vae_lora_config, adapter_name="vae_skip") - _sd_vae = vae.state_dict() - for k in sd["state_dict_vae"]: - _sd_vae[k] = sd["state_dict_vae"][k] - vae.load_state_dict(_sd_vae) - unet.add_adapter(unet_lora_config) - _sd_unet = unet.state_dict() - for k in sd["state_dict_unet"]: - _sd_unet[k] = sd["state_dict_unet"][k] - unet.load_state_dict(_sd_unet) - - elif pretrained_name == "sketch_to_image_stochastic": - # download from url - url = "https://www.cs.cmu.edu/~img2img-turbo/models/sketch_to_image_stochastic_lora.pkl" - ckpt_folder.mkdir(exist_ok=True) - outf = ckpt_folder / 
"sketch_to_image_stochastic_lora.pkl" - if not outf.exists(): - print(f"Downloading checkpoint to {outf}") - response = requests.get(url, stream=True) - total_size_in_bytes = int(response.headers.get("content-length", 0)) - block_size = 1024 # 1 Kibibyte - progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) - with open(outf, "wb") as file: - for data in response.iter_content(block_size): - progress_bar.update(len(data)) - file.write(data) - progress_bar.close() - if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: - print("ERROR, something went wrong") - print(f"Downloaded successfully to {outf}") - p_ckpt = outf - convin_pretrained = copy.deepcopy(unet.conv_in) - unet.conv_in = TwinConv(convin_pretrained, unet.conv_in) - sd = torch.load(p_ckpt, map_location="cpu") - unet_lora_config = LoraConfig(r=sd["rank_unet"], init_lora_weights="gaussian", target_modules=sd["unet_lora_target_modules"]) - vae_lora_config = LoraConfig(r=sd["rank_vae"], init_lora_weights="gaussian", target_modules=sd["vae_lora_target_modules"]) - vae.add_adapter(vae_lora_config, adapter_name="vae_skip") - _sd_vae = vae.state_dict() - for k in sd["state_dict_vae"]: - if k not in _sd_vae: - continue - _sd_vae[k] = sd["state_dict_vae"][k] - - vae.load_state_dict(_sd_vae) - unet.add_adapter(unet_lora_config) - _sd_unet = unet.state_dict() - for k in sd["state_dict_unet"]: - _sd_unet[k] = sd["state_dict_unet"][k] - unet.load_state_dict(_sd_unet) - - elif pretrained_path is not None: - sd = torch.load(pretrained_path, map_location="cpu") - unet_lora_config = LoraConfig(r=sd["rank_unet"], init_lora_weights="gaussian", target_modules=sd["unet_lora_target_modules"]) - vae_lora_config = LoraConfig(r=sd["rank_vae"], init_lora_weights="gaussian", target_modules=sd["vae_lora_target_modules"]) - vae.add_adapter(vae_lora_config, adapter_name="vae_skip") - _sd_vae = vae.state_dict() - for k in sd["state_dict_vae"]: - _sd_vae[k] = sd["state_dict_vae"][k] - vae.load_state_dict(_sd_vae) - unet.add_adapter(unet_lora_config) - _sd_unet = unet.state_dict() - for k in sd["state_dict_unet"]: - _sd_unet[k] = sd["state_dict_unet"][k] - unet.load_state_dict(_sd_unet) - - # unet.enable_xformers_memory_efficient_attention() - unet.to("cpu") - vae.to("cpu") - self.unet, self.vae = unet, vae - self.vae.decoder.gamma = 1 - self.timesteps = torch.tensor([999], device="cpu").long() - self.text_encoder.requires_grad_(False) - - def set_r(self, r): - self.unet.set_adapters(["default"], weights=[r]) - set_weights_and_activate_adapters(self.vae, ["vae_skip"], [r]) - self.r = r - self.unet.conv_in.r = r - self.vae.decoder.gamma = r - - def forward(self, c_t, prompt_tokens, noise_map): - caption_enc = self.text_encoder(prompt_tokens)[0] - # scale the lora weights based on the r value - sample, current_down_blocks = self.vae.encode(c_t) - encoded_control = sample.sample() * self.vae.config.scaling_factor - # combine the input and noise - unet_input = encoded_control * self.r + noise_map * (1 - self.r) - - unet_output = self.unet( - unet_input, - self.timesteps, - encoder_hidden_states=caption_enc, - ).sample - x_denoised = self.sched.step(unet_output, self.timesteps, unet_input, return_dict=True).prev_sample - output_image = (self.vae.decode(x_denoised / self.vae.config.scaling_factor, current_down_blocks)[0]).clamp(-1, 1) - return output_image - -.. 
code:: ipython3 - - ov_model_path = Path("model/pix2pix-turbo.xml") - - pt_model = None - - if not ov_model_path.exists(): - pt_model = Pix2PixTurbo("sketch_to_image_stochastic") - pt_model.set_r(0.4) - pt_model.eval() - - - -.. parsed-literal:: - - model.fp16.safetensors: 0%| | 0.00/681M [00:00`__ -to take the advantage of advanced OpenVINO optimization tools and -features. You need to provide a model object, input data for model -tracing to `OpenVINO Model Conversion -API `__. -``ov.convert_model`` function convert PyTorch model instance to -``ov.Model`` object that can be used for compilation on device or saved -on disk using ``ov.save_model`` in compressed to FP16 format. - -.. code:: ipython3 - - import gc - import openvino as ov - - if not ov_model_path.exists(): - example_input = [torch.ones((1, 3, 512, 512)), torch.ones([1, 77], dtype=torch.int64), torch.ones([1, 4, 64, 64])] - with torch.no_grad(): - ov_model = ov.convert_model(pt_model, example_input=example_input, input=[[1, 3, 512, 512], [1, 77], [1, 4, 64, 64]]) - ov.save_model(ov_model, ov_model_path) - del ov_model - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - del pt_model - gc.collect(); - - # uncomment these lines if you want cleenup download pytorch model checkpoints - - # import shutil - - # checkpoints_dir = Path("checkpoints") - # for file in checkpoints_dir.glob("*"): - # shutil.rmtree(file, ignore_errors=True) - -Select inference device ------------------------ - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO') - - - -Compile model -------------- - - - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - compiled_model = core.compile_model(ov_model_path, device.value) - -Run model inference -------------------- - - - -Now, let’s try model in action and turn simple cat sketch into -professional artwork. - -.. code:: ipython3 - - from diffusers.utils import load_image - - sketch_image_path = Path("sketch.png") - if not sketch_image_path.exists(): - sketch_image = load_image("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/f964a51d-34e8-411a-98f4-5f97a28f56b0") - sketch_image.save(sketch_image_path) - else: - sketch_image = load_image(sketch_image_path) - - sketch_image - - - - -.. image:: sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_15_0.png - - - -.. code:: ipython3 - - import torchvision.transforms.functional as F - - torch.manual_seed(145) - c_t = torch.unsqueeze(F.to_tensor(sketch_image) > 0.5, 0) - noise = torch.randn((1, 4, 512 // 8, 512 // 8)) - -.. code:: ipython3 - - prompt_template = "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed" - prompt = prompt_template.replace("{prompt}", "fluffy magic cat") - - prompt_tokens = tokenize_prompt(prompt) - -.. code:: ipython3 - - result = compiled_model([1 - c_t.to(torch.float32), prompt_tokens, noise])[0] - -.. code:: ipython3 - - from PIL import Image - import numpy as np - - image_tensor = (result[0] * 0.5 + 0.5) * 255 - image = np.transpose(image_tensor, (1, 2, 0)).astype(np.uint8) - Image.fromarray(image) - - - - -.. 
image:: sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.png - - - -Interactive demo ----------------- - - - -In this section, you can try model on own paintings. - -**Instructions:** \* Enter a text prompt (e.g. cat) \* Start sketching, -using pencil and eraser buttons \* Change the image style using a style -template \* Try different seeds to generate different results \* -Download results using download button - -.. code:: ipython3 - - import base64 - from io import BytesIO - import gradio as gr - - - def pil_image_to_data_uri(img, format="PNG"): - buffered = BytesIO() - img.save(buffered, format=format) - img_str = base64.b64encode(buffered.getvalue()).decode() - return f"data:image/{format.lower()};base64,{img_str}" - - - def run(image, prompt, prompt_template, style_name, seed): - print(f"prompt: {prompt}") - print("sketch updated") - if image is None: - ones = Image.new("L", (512, 512), 255) - temp_uri = pil_image_to_data_uri(ones) - return ones, gr.update(link=temp_uri), gr.update(link=temp_uri) - prompt = prompt_template.replace("{prompt}", prompt) - image = image.convert("RGB") - image_t = F.to_tensor(image) > 0.5 - print(f"seed={seed}") - caption_tokens = tokenizer(prompt, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt").input_ids.cpu() - with torch.no_grad(): - c_t = image_t.unsqueeze(0) - torch.manual_seed(seed) - B, C, H, W = c_t.shape - noise = torch.randn((1, 4, H // 8, W // 8)) - output_image = torch.from_numpy(compiled_model([c_t.to(torch.float32), caption_tokens, noise])[0]) - output_pil = F.to_pil_image(output_image[0].cpu() * 0.5 + 0.5) - input_sketch_uri = pil_image_to_data_uri(Image.fromarray(255 - np.array(image))) - output_image_uri = pil_image_to_data_uri(output_pil) - return ( - output_pil, - gr.update(link=input_sketch_uri), - gr.update(link=output_image_uri), - ) - -.. code:: ipython3 - - # Go back to the sketch-to-image-pix2pix-turbo notebook directory - %cd .. 
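
    # Optional check (not in the original notebook): confirm the working directory,
    # so that gradio_helper.py is saved next to the notebook rather than inside the cloned repo.
    from pathlib import Path
    print(f"Current working directory: {Path.cwd()}")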
- - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/sketch-to-image-pix2pix-turbo/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=run) - - try: - demo.queue().launch(debug=True) - except Exception: - demo.queue().launch(debug=True, share=True) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ diff --git a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_15_0.jpg b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_15_0.jpg deleted file mode 100644 index d44930d1295f7d..00000000000000 --- a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_15_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ecc12808bb01e9203f487cfacfb487e613b945d428c0c8b8bd4543b320645058 -size 12027 diff --git a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_15_0.png b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_15_0.png deleted file mode 100644 index f9551560338c5c..00000000000000 --- a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_15_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:64c9d35bc76286eaab5acf7b3449a599139ccae9533be2f39bc61ff9dc127522 -size 27834 diff --git a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.jpg b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.jpg deleted file mode 100644 index 7c1d98ac61eebc..00000000000000 --- a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:60f756dfe6e1551104f5aa1553ba9427c80d11eb4912196157314e312451eb01 -size 25480 diff --git a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.png b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.png deleted file mode 100644 index 72bebad9519bc7..00000000000000 --- a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f1992a0759f4fe1ba268e5b907fa99348d43295e2c9a2a088ec1199afb4eb643 -size 321829 diff --git a/docs/notebooks/softvc-voice-conversion-with-output.rst b/docs/notebooks/softvc-voice-conversion-with-output.rst deleted file mode 100644 index 192b5312cc6777..00000000000000 --- a/docs/notebooks/softvc-voice-conversion-with-output.rst +++ /dev/null @@ -1,340 +0,0 @@ -SoftVC VITS Singing Voice Conversion and OpenVINO™ -================================================== - -This tutorial is based on `SoftVC VITS Singing Voice Conversion -project `__. 
The -purpose of this project was to enable developers to have their beloved -anime characters perform singing tasks. The developers’ intention was to -focus solely on fictional characters and avoid any involvement of real -individuals, anything related to real individuals deviates from the -developer’s original intention. - -The singing voice conversion model uses SoftVC content encoder to -extract speech features from the source audio. These feature vectors are -directly fed into `VITS `__ -without the need for conversion to a text-based intermediate -representation. As a result, the pitch and intonations of the original -audio are preserved. - -In this tutorial we will use the base model flow. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Use the original model to run an - inference <#use-the-original-model-to-run-an-inference>`__ -- `Convert to OpenVINO IR model <#convert-to-openvino-ir-model>`__ -- `Run the OpenVINO model <#run-the-openvino-model>`__ -- `Interactive inference <#interactive-inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - -This model has some problems -with installation. So, we have to downgrade ``pip`` to version below -``24.1`` to avoid problems with ``fairseq``. Also, ``fairseq`` has some -problems on ``Windows`` on ``python>=3.11`` and we use a custom version -of this library to resolve them. - -.. code:: ipython3 - - import platform - import sys - import warnings - - - %pip install -qU "pip<24.1" - if platform.system() == "Windows" and sys.version_info >= (3, 11, 0): - warnings.warn( - "Building the custmom fairseq package may take a long time and may require additional privileges in the system. We recommend using Python versions 3.8, 3.9 or 3.10 for this model." - ) - %pip install git+https://github.com/aleksandr-mokrov/fairseq.git --extra-index-url https://download.pytorch.org/whl/cpu - else: - %pip install "fairseq==0.12.2" --extra-index-url https://download.pytorch.org/whl/cpu - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w", encoding="utf-8").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("softvc-voice-conversion.ipynb") - -.. code:: ipython3 - - from cmd_helper import clone_repo - - - %pip install -q "openvino>=2023.2.0" - clone_repo("https://github.com/svc-develop-team/so-vits-svc", revision="4.1-Stable", add_to_sys_path=False) - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu tqdm librosa "torch>=2.1.0,<2.6.0" "torchaudio>=2.1.0,<2.6.0" faiss-cpu "gradio>=4.19" "numpy>=1.23.5" praat-parselmouth - -Download pretrained models and configs. 
We use the recommended -`ContentVec `__ encoder and, as an example, models from `a -collection of so-vits-svc-4.0 models made by the Pony Preservation -Project `__. You can choose any other pretrained model from this or another -project or `prepare your -own `__. - -.. code:: ipython3 - - import os - - from notebook_utils import download_file, device_widget - - # ContentVec - if not Path("so-vits-svc/pretrain/checkpoint_best_legacy_500.pt").exists(): - download_file( - "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt", - "checkpoint_best_legacy_500.pt", - directory="so-vits-svc/pretrain/", - ) - - # pretrained models and configs from a collection of so-vits-svc-4.0 models. You can use other models. - if not Path("so-vits-svc/logs/44k/kmeans_10000.pt").exists(): - download_file( - "https://huggingface.co/therealvul/so-vits-svc-4.0/resolve/main/Rainbow%20Dash%20(singing)/kmeans_10000.pt", - "kmeans_10000.pt", - directory="so-vits-svc/logs/44k/", - ) - if os.stat("so-vits-svc/configs/config.json").st_size == 0: # clean up if it is the default empty file - os.remove("so-vits-svc/configs/config.json") - - if not Path("so-vits-svc/configs/config.json").exists(): - download_file( - "https://huggingface.co/therealvul/so-vits-svc-4.0/resolve/main/Rainbow%20Dash%20(singing)/config.json", - "config.json", - directory="so-vits-svc/configs/", - ) - if not Path("so-vits-svc/logs/44k/G_30400.pth").exists(): - download_file( - "https://huggingface.co/therealvul/so-vits-svc-4.0/resolve/main/Rainbow%20Dash%20(singing)/G_30400.pth", - "G_30400.pth", - directory="so-vits-svc/logs/44k/", - ) - if not Path("so-vits-svc/logs/44k/D_30400.pth").exists(): - download_file( - "https://huggingface.co/therealvul/so-vits-svc-4.0/resolve/main/Rainbow%20Dash%20(singing)/D_30400.pth", - "D_30400.pth", - directory="so-vits-svc/logs/44k/", - ) - - # a wav sample - if not Path("so-vits-svc/raw/000.wav").exists(): - download_file( - "https://huggingface.co/datasets/santifiorino/spinetta/resolve/main/spinetta/000.wav", - "000.wav", - directory="so-vits-svc/raw/", - ) - -Use the original model to run an inference ------------------------------------------- - - - -Change the directory to ``so-vits-svc`` in order not to break internal -relative paths. - -.. code:: ipython3 - - %cd so-vits-svc - -Define the ``Svc`` model. - -.. code:: ipython3 - - from inference.infer_tool import Svc - - model = Svc("logs/44k/G_30400.pth", "configs/config.json", device="cpu") - -Define ``kwargs`` and make an inference. - -.. code:: ipython3 - - kwargs = { - "raw_audio_path": "raw/000.wav", # path to the source audio - "spk": "Rainbow Dash (singing)", # speaker ID into which the source audio should be converted - "tran": 0, - "slice_db": -40, - "cluster_infer_ratio": 0, - "auto_predict_f0": False, - "noice_scale": 0.4, - } - - audio = model.slice_inference(**kwargs) - -Now let's compare the original audio with the result. - -.. code:: ipython3 - - import IPython.display as ipd - - # original - ipd.Audio("raw/000.wav", rate=model.target_sample) - -.. code:: ipython3 - - # result - ipd.Audio(audio, rate=model.target_sample) - -Convert to OpenVINO IR model ----------------------------- - - - -Model components are PyTorch modules that can be converted with the -``ov.convert_model`` function directly. We also use the ``ov.save_model`` -function to serialize the result of the conversion. ``Svc`` is not a model itself; -it runs model inference inside. In the base scenario, only the ``SynthesizerTrn`` model, -named ``net_g_ms``, is used.
It is enough to convert only this model, and -we re-assign its ``forward`` method to the ``infer`` method for this -purpose. - -``SynthesizerTrn`` uses several models inside its flow, -i.e. \ ``TextEncoder``, ``Generator``, ``ResidualCouplingBlock``, etc., -but in our case OpenVINO allows converting the whole pipeline in one step -without the need to look inside. - -.. code:: ipython3 - - import openvino as ov - import torch - from pathlib import Path - - - dummy_c = torch.randn(1, 256, 813) - dummy_f0 = torch.randn(1, 813) - dummy_uv = torch.ones(1, 813) - dummy_g = torch.tensor([[0]]) - model.net_g_ms.forward = model.net_g_ms.infer - - net_g_kwargs = { - "c": dummy_c, - "f0": dummy_f0, - "uv": dummy_uv, - "g": dummy_g, - "noice_scale": torch.tensor(0.35), # need to wrap numeric and boolean values for conversion - "seed": torch.tensor(52468), - "predict_f0": torch.tensor(False), - "vol": torch.tensor(0), - } - core = ov.Core() - - - net_g_model_xml_path = Path("models/ov_net_g_model.xml") - - if not net_g_model_xml_path.exists(): - converted_model = ov.convert_model(model.net_g_ms, example_input=net_g_kwargs) - net_g_model_xml_path.parent.mkdir(parents=True, exist_ok=True) - ov.save_model(converted_model, net_g_model_xml_path) - -Run the OpenVINO model ----------------------- - - - -Select a device from the dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - - device = device_widget() - - device - -We create a wrapper for the ``net_g_ms`` model to keep its -interface, then replace the original ``net_g_ms`` model with the converted IR -model. We use ``ov.compile_model`` to make it ready for inference -on the selected device. - -.. code:: ipython3 - - class NetGModelWrapper: - def __init__(self, net_g_model_xml_path): - super().__init__() - self.net_g_model = core.compile_model(net_g_model_xml_path, device.value) - - def infer(self, c, *, f0, uv, g, noice_scale=0.35, seed=52468, predict_f0=False, vol=None): - if vol is None: # None is not allowed as an input - results = self.net_g_model((c, f0, uv, g, noice_scale, seed, predict_f0)) - else: - results = self.net_g_model((c, f0, uv, g, noice_scale, seed, predict_f0, vol)) - - return torch.from_numpy(results[0]), torch.from_numpy(results[1]) - - - model.net_g_ms = NetGModelWrapper(net_g_model_xml_path) - audio = model.slice_inference(**kwargs) - -Check the result. It should be identical to the one produced by the original model. - -.. code:: ipython3 - - import IPython.display as ipd - - ipd.Audio(audio, rate=model.target_sample) - -Interactive inference ---------------------- - - - -.. code:: ipython3 - - def infer(src_audio, tran, slice_db, noice_scale): - kwargs["raw_audio_path"] = src_audio - kwargs["tran"] = tran - kwargs["slice_db"] = slice_db - kwargs["noice_scale"] = noice_scale - - audio = model.slice_inference(**kwargs) - - return model.target_sample, audio - -..
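Before wiring the function into a Gradio app, it can also be called directly on the sample file from the earlier cells (assuming the converted model and ``raw/000.wav`` are still available):

.. code:: ipython3

    import IPython.display as ipd

    sample_rate, converted = infer("raw/000.wav", tran=0, slice_db=-40, noice_scale=0.4)
    ipd.Audio(converted, rate=sample_rate)

The cell below builds the interactive demo around the same function.

..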
code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/softvc-voice-conversion/gradio_helper.py") - open("gradio_helper.py", "w", encoding="utf-8").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=infer) - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(share=True, debug=False) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ diff --git a/docs/notebooks/sound-generation-audioldm2-with-output.rst b/docs/notebooks/sound-generation-audioldm2-with-output.rst deleted file mode 100644 index 9d94a7bf9dfc5e..00000000000000 --- a/docs/notebooks/sound-generation-audioldm2-with-output.rst +++ /dev/null @@ -1,855 +0,0 @@ -Sound Generation with AudioLDM2 and OpenVINO™ -============================================= - -`AudioLDM 2 `__ is a latent -text-to-audio diffusion model capable of generating realistic audio -samples given any text input. - -AudioLDM 2 was proposed in the paper `AudioLDM 2: Learning Holistic -Audio Generation with Self-supervised -Pretraining `__ by ``Haohe Liu`` et -al. - -The model takes a text prompt as input and predicts the corresponding -audio. It can generate text-conditional sound effects, human speech and -music. - -|image0| - -In this tutorial we will try out the pipeline, convert the models -backing it one by one and will run an interactive app with Gradio! - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Instantiating Generation - Pipeline <#instantiating-generation-pipeline>`__ -- `Convert models to OpenVINO Intermediate representation (IR) - format <#convert-models-to-openvino-intermediate-representation-ir-format>`__ - - - `CLAP Text Encoder Conversion <#clap-text-encoder-conversion>`__ - - `T5 Text Encoder Conversion <#t5-text-encoder-conversion>`__ - - `Projection model conversion <#projection-model-conversion>`__ - - `GPT-2 conversion <#gpt-2-conversion>`__ - - `Vocoder conversion <#vocoder-conversion>`__ - - `UNet conversion <#unet-conversion>`__ - - `VAE Decoder conversion <#vae-decoder-conversion>`__ - -- `Select inference device for AudioLDM2 - pipeline <#select-inference-device-for-audioldm2-pipeline>`__ -- `Adapt OpenVINO models to the original - pipeline <#adapt-openvino-models-to-the-original-pipeline>`__ -- `Try out the converted pipeline <#try-out-the-converted-pipeline>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. |image0| image:: https://github.com/openvinotoolkit/openvino_notebooks/assets/76463150/c93a0f86-d9cf-4bd1-93b9-e27532170d75 - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -qU accelerate "diffusers>=0.30.0" "transformers>=4.43" "torch>=2.1" "gradio>=4.19" "peft>=0.6.2" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "openvino>=2024.0.0" - -.. 
code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("sound-generation-audioldm2.ipynb") - -Instantiating Generation Pipeline ---------------------------------- - - - -To work with `AudioLDM 2 `__ by -`Centre for Vision, Speech and Signal Processing - University of Surrey `__, -we will use `Hugging Face Diffusers -package `__. Diffusers package -exposes the ``AudioLDM2Pipeline`` class, simplifying the model -instantiation and weights loading. The code below demonstrates how to -create a ``AudioLDM2Pipeline`` and generate a text-conditioned sound -sample. - -.. code:: ipython3 - - from collections import namedtuple - from functools import partial - import gc - from pathlib import Path - - from diffusers import AudioLDM2Pipeline - from IPython.display import Audio - import numpy as np - import openvino as ov - import torch - - MODEL_ID = "cvssp/audioldm2" - pipe = AudioLDM2Pipeline.from_pretrained(MODEL_ID) - - prompt = "birds singing in the forest" - negative_prompt = "Low quality" - audio = pipe( - prompt, - negative_prompt=negative_prompt, - num_inference_steps=150, - audio_length_in_s=3.0, - ).audios[0] - - sampling_rate = 16000 - Audio(audio, rate=sampling_rate) - - -.. parsed-literal:: - - 2024-08-14 08:47:45.295561: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-08-14 08:47:45.297388: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used. - 2024-08-14 08:47:45.331810: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-08-14 08:47:45.955688: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - - -.. parsed-literal:: - - Fetching 26 files: 0%| | 0/26 [00:00 - - Your browser does not support the audio element. - - - - - -Convert models to OpenVINO Intermediate representation (IR) format ------------------------------------------------------------------- - - - -`Model conversion -API `__ -enables direct conversion of PyTorch models backing the pipeline. We -need to provide a model object, input data for model tracing to -``ov.convert_model`` function to obtain OpenVINO ``ov.Model`` object -instance. Model can be saved on disk for next deployment using -``ov.save_model`` function. - -The pipeline consists of seven important parts: - -- T5 and CLAP Text Encoders for creation condition to generate an sound - from a text prompt. -- Projection model to merge outputs from the two text encoders. -- GPT-2 language model to generate a sequence of hidden-states - conditioned on the projected outputs from the two text encoders. 
-- Vocoder to convert the mel-spectrogram latents to the final audio - waveform. -- Unet for step-by-step denoising latent image representation. -- Autoencoder (VAE) for decoding latent space to image. - -.. code:: ipython3 - - models_base_folder = Path("models") - - - def cleanup_torchscript_cache(): - """ - Helper for removing cached model representation - """ - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - -CLAP Text Encoder Conversion -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -First frozen text-encoder. AudioLDM2 uses the joint audio-text embedding -model -`CLAP `__, -specifically the -`laion/clap-htsat-unfused `__ -variant. The text branch is used to encode the text prompt to a prompt -embedding. The full audio-text model is used to rank generated waveforms -against the text prompt by computing similarity scores. - -.. code:: ipython3 - - class ClapEncoderWrapper(torch.nn.Module): - def __init__(self, encoder): - super().__init__() - encoder.eval() - self.encoder = encoder - - def forward(self, input_ids, attention_mask): - return self.encoder.get_text_features(input_ids, attention_mask) - - - clap_text_encoder_ir_path = models_base_folder / "clap_text_encoder.xml" - - if not clap_text_encoder_ir_path.exists(): - with torch.no_grad(): - ov_model = ov.convert_model( - ClapEncoderWrapper(pipe.text_encoder), # model instance - example_input={ - "input_ids": torch.ones((1, 512), dtype=torch.long), - "attention_mask": torch.ones((1, 512), dtype=torch.long), - }, # inputs for model tracing - ) - ov.save_model(ov_model, clap_text_encoder_ir_path) - del ov_model - cleanup_torchscript_cache() - gc.collect() - print("Text Encoder successfully converted to IR") - else: - print(f"Text Encoder will be loaded from {clap_text_encoder_ir_path}") - - -.. parsed-literal:: - - WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. - - -.. parsed-literal:: - - [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/transformers/modeling_utils.py:4664: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead - warnings.warn( - - -.. parsed-literal:: - - Text Encoder successfully converted to IR - - -T5 Text Encoder Conversion -~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -As second frozen text-encoder, AudioLDM2 uses the -`T5 `__, -specifically the -`google/flan-t5-large `__ -variant. - -The text-encoder is responsible for transforming the input prompt, for -example, “birds singing in the forest” into an embedding space that can -be understood by the U-Net. It is usually a simple transformer-based -encoder that maps a sequence of input tokens to a sequence of latent -text embeddings. - -The input of the text encoder is tensor ``input_ids``, which contains -indexes of tokens from text processed by the tokenizer and padded to the -maximum length accepted by the model. Model outputs are two tensors: -``last_hidden_state`` - hidden state from the last MultiHeadAttention -layer in the model and ``pooler_out`` - pooled output for whole model -hidden states. - -.. 
code:: ipython3 - - t5_text_encoder_ir_path = models_base_folder / "t5_text_encoder.xml" - - if not t5_text_encoder_ir_path.exists(): - pipe.text_encoder_2.eval() - with torch.no_grad(): - ov_model = ov.convert_model( - pipe.text_encoder_2, # model instance - example_input=torch.ones((1, 7), dtype=torch.long), # inputs for model tracing - ) - ov.save_model(ov_model, t5_text_encoder_ir_path) - del ov_model - cleanup_torchscript_cache() - gc.collect() - print("Text Encoder successfully converted to IR") - else: - print(f"Text Encoder will be loaded from {t5_text_encoder_ir_path}") - - -.. parsed-literal:: - - Text Encoder successfully converted to IR - - -Projection model conversion -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -A trained model used to linearly project the hidden-states from the -first and second text encoder models and insert learned Start Of -Sequence and End Of Sequence token embeddings. The projected -hidden-states from the two text encoders are concatenated to give the -input to the language model. - -.. code:: ipython3 - - projection_model_ir_path = models_base_folder / "projection_model.xml" - - projection_model_inputs = { - "hidden_states": torch.randn((1, 1, 512), dtype=torch.float32), - "hidden_states_1": torch.randn((1, 7, 1024), dtype=torch.float32), - "attention_mask": torch.ones((1, 1), dtype=torch.int64), - "attention_mask_1": torch.ones((1, 7), dtype=torch.int64), - } - - if not projection_model_ir_path.exists(): - pipe.projection_model.eval() - with torch.no_grad(): - ov_model = ov.convert_model( - pipe.projection_model, # model instance - example_input=projection_model_inputs, # inputs for model tracing - ) - ov.save_model(ov_model, projection_model_ir_path) - del ov_model - cleanup_torchscript_cache() - gc.collect() - print("The Projection Model successfully converted to IR") - else: - print(f"The Projection Model will be loaded from {projection_model_ir_path}") - - -.. parsed-literal:: - - The Projection Model successfully converted to IR - - -GPT-2 conversion -~~~~~~~~~~~~~~~~ - - - -`GPT-2 `__ is an auto-regressive language -model used to generate a sequence of hidden-states conditioned on the -projected outputs from the two text encoders. - -.. code:: ipython3 - - language_model_ir_path = models_base_folder / "language_model.xml" - - language_model_inputs = { - "inputs_embeds": torch.randn((1, 12, 768), dtype=torch.float32), - "attention_mask": torch.ones((1, 12), dtype=torch.int64), - } - - if not language_model_ir_path.exists(): - pipe.language_model.config.torchscript = True - pipe.language_model.eval() - pipe.language_model.__call__ = partial( - pipe.language_model.__call__, - kwargs={"past_key_values": None, "use_cache": False, "return_dict": False}, - ) - with torch.no_grad(): - ov_model = ov.convert_model( - pipe.language_model, # model instance - example_input=language_model_inputs, # inputs for model tracing - ) - - ov_model.inputs[0].get_node().set_partial_shape(ov.PartialShape([1, -1])) - ov_model.inputs[0].get_node().set_element_type(ov.Type.i64) - ov_model.inputs[1].get_node().set_partial_shape(ov.PartialShape([1, -1, 768])) - ov_model.inputs[1].get_node().set_element_type(ov.Type.f32) - - ov_model.validate_nodes_and_infer_types() - - ov.save_model(ov_model, language_model_ir_path) - del ov_model - cleanup_torchscript_cache() - gc.collect() - print("The Projection Model successfully converted to IR") - else: - print(f"The Projection Model will be loaded from {language_model_ir_path}") - - -.. 
parsed-literal:: - - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:114: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal: - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:162: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if past_key_values_length > 0: - - -.. parsed-literal:: - - The Projection Model successfully converted to IR - - -Vocoder conversion -~~~~~~~~~~~~~~~~~~ - - - -`SpeechT5 HiFi-GAN Vocoder `__ -is used to convert the mel-spectrogram latents to the final audio -waveform. - -.. code:: ipython3 - - vocoder_ir_path = models_base_folder / "vocoder.xml" - - if not vocoder_ir_path.exists(): - pipe.vocoder.eval() - with torch.no_grad(): - ov_model = ov.convert_model( - pipe.vocoder, # model instance - example_input=torch.ones((1, 700, 64), dtype=torch.float32), # inputs for model tracing - ) - ov.save_model(ov_model, vocoder_ir_path) - del ov_model - cleanup_torchscript_cache() - gc.collect() - print("The Vocoder successfully converted to IR") - else: - print(f"The Vocoder will be loaded from {vocoder_ir_path}") - - -.. parsed-literal:: - - The Vocoder successfully converted to IR - - -UNet conversion -~~~~~~~~~~~~~~~ - - - -The UNet model is used to denoise the encoded audio latents. The process -of UNet model conversion remains the same, like for original Stable -Diffusion model. - -.. code:: ipython3 - - unet_ir_path = models_base_folder / "unet.xml" - - pipe.unet.eval() - unet_inputs = { - "sample": torch.randn((2, 8, 75, 16), dtype=torch.float32), - "timestep": torch.tensor(1, dtype=torch.int64), - "encoder_hidden_states": torch.randn((2, 8, 768), dtype=torch.float32), - "encoder_hidden_states_1": torch.randn((2, 7, 1024), dtype=torch.float32), - "encoder_attention_mask_1": torch.ones((2, 7), dtype=torch.int64), - } - - if not unet_ir_path.exists(): - with torch.no_grad(): - ov_model = ov.convert_model(pipe.unet, example_input=unet_inputs) - - ov_model.inputs[0].get_node().set_partial_shape(ov.PartialShape((2, 8, -1, 16))) - ov_model.inputs[2].get_node().set_partial_shape(ov.PartialShape((2, 8, 768))) - ov_model.inputs[3].get_node().set_partial_shape(ov.PartialShape((2, -1, 1024))) - ov_model.inputs[4].get_node().set_partial_shape(ov.PartialShape((2, -1))) - ov_model.validate_nodes_and_infer_types() - - ov.save_model(ov_model, unet_ir_path) - - del ov_model - cleanup_torchscript_cache() - gc.collect() - print("Unet successfully converted to IR") - else: - print(f"Unet will be loaded from {unet_ir_path}") - - -.. parsed-literal:: - - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/diffusers/pipelines/audioldm2/modeling_audioldm2.py:736: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
- if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/diffusers/models/downsampling.py:136: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert hidden_states.shape[1] == self.channels - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/diffusers/models/downsampling.py:145: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert hidden_states.shape[1] == self.channels - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/diffusers/models/attention_processor.py:613: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if current_length != target_length: - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/diffusers/models/attention_processor.py:628: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if attention_mask.shape[0] < batch_size * head_size: - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/diffusers/models/upsampling.py:146: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - assert hidden_states.shape[1] == self.channels - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/diffusers/models/upsampling.py:162: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - if hidden_states.shape[0] >= 64: - - -.. parsed-literal:: - - Unet successfully converted to IR - - -VAE Decoder conversion -~~~~~~~~~~~~~~~~~~~~~~ - - - -The VAE model has two parts, an encoder, and a decoder. The encoder is -used to convert the image into a low-dimensional latent representation, -which will serve as the input to the U-Net model. The decoder, -conversely, transforms the latent representation back into an image. - -During latent diffusion training, the encoder is used to get the latent -representations (latents) of the images for the forward diffusion -process, which applies more and more noise at each step. During -inference, the denoised latents generated by the reverse diffusion -process are converted back into images using the VAE decoder. During -inference, we will see that we **only need the VAE decoder**. You can -find instructions on how to convert the encoder part in a stable -diffusion -`notebook `__. - -.. 
code:: ipython3 - - vae_ir_path = models_base_folder / "vae.xml" - - - class VAEDecoderWrapper(torch.nn.Module): - def __init__(self, vae): - super().__init__() - vae.eval() - self.vae = vae - - def forward(self, latents): - return self.vae.decode(latents) - - - if not vae_ir_path.exists(): - vae_decoder = VAEDecoderWrapper(pipe.vae) - latents = torch.zeros((1, 8, 175, 16)) - - vae_decoder.eval() - with torch.no_grad(): - ov_model = ov.convert_model(vae_decoder, example_input=latents) - ov.save_model(ov_model, vae_ir_path) - del ov_model - cleanup_torchscript_cache() - gc.collect() - print("VAE decoder successfully converted to IR") - else: - print(f"VAE decoder will be loaded from {vae_ir_path}") - - -.. parsed-literal:: - - VAE decoder successfully converted to IR - - -Select inference device for AudioLDM2 pipeline ----------------------------------------------- - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - core = ov.Core() - - device = device_widget() - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -Adapt OpenVINO models to the original pipeline ----------------------------------------------- - - - -Here we create wrapper classes for all three OpenVINO models that we -want to embed in the original inference pipeline. Here are some of the -things to consider when adapting an OV model: - Make sure that -parameters passed by the original pipeline are forwarded to the compiled -OV model properly; sometimes the OV model uses only a portion of the -input arguments and some are ignored, sometimes you need to convert the -argument to another data type or unwrap some data structures such as -tuples or dictionaries. - Do guarantee that the wrapper class returns -results to the pipeline in an expected format. In the example below you -can see how we pack OV model outputs into special named tuples to adapt -them for the pipeline. - Pay attention to the model method used in the -original pipeline for calling the model - it may be not the ``forward`` -method! Refer to the ``OVClapEncoderWrapper`` to see how we wrap OV -model inference into the ``get_text_features`` method. - -.. code:: ipython3 - - class OVClapEncoderWrapper: - def __init__(self, encoder_ir, config): - self.encoder = core.compile_model(encoder_ir, device.value) - self.config = config - - def get_text_features(self, input_ids, attention_mask, **_): - last_hidden_state = self.encoder([input_ids, attention_mask])[0] - return torch.from_numpy(last_hidden_state) - - - class OVT5EncoderWrapper: - def __init__(self, encoder_ir, config): - self.encoder = core.compile_model(encoder_ir, device.value) - self.config = config - self.dtype = self.config.torch_dtype - - def __call__(self, input_ids, **_): - last_hidden_state = self.encoder(input_ids)[0] - return torch.from_numpy(last_hidden_state)[None, ...] 
- - - class OVVocoderWrapper: - def __init__(self, vocoder_ir, config): - self.vocoder = core.compile_model(vocoder_ir, device.value) - self.config = config - - def __call__(self, mel_spectrogram, **_): - waveform = self.vocoder(mel_spectrogram)[0] - return torch.from_numpy(waveform) - - - class OVProjectionModelWrapper: - def __init__(self, proj_model_ir, config): - self.proj_model = core.compile_model(proj_model_ir, device.value) - self.config = config - self.output_type = namedtuple("ProjectionOutput", ["hidden_states", "attention_mask"]) - - def __call__(self, hidden_states, hidden_states_1, attention_mask, attention_mask_1, **_): - output = self.proj_model( - { - "hidden_states": hidden_states, - "hidden_states_1": hidden_states_1, - "attention_mask": attention_mask, - "attention_mask_1": attention_mask_1, - } - ) - return self.output_type(torch.from_numpy(output[0]), torch.from_numpy(output[1])) - - - class OVUnetWrapper: - def __init__(self, unet_ir, config): - self.unet = core.compile_model(unet_ir, device.value) - self.config = config - - def __call__(self, sample, timestep, encoder_hidden_states, encoder_hidden_states_1, encoder_attention_mask_1, **_): - output = self.unet( - { - "sample": sample, - "timestep": timestep, - "encoder_hidden_states": encoder_hidden_states, - "encoder_hidden_states_1": encoder_hidden_states_1, - "encoder_attention_mask_1": encoder_attention_mask_1, - } - ) - return (torch.from_numpy(output[0]),) - - - class OVVaeDecoderWrapper: - def __init__(self, vae_ir, config): - self.vae = core.compile_model(vae_ir, device.value) - self.config = config - self.output_type = namedtuple("VaeOutput", ["sample"]) - - def decode(self, latents, **_): - last_hidden_state = self.vae(latents)[0] - return self.output_type(torch.from_numpy(last_hidden_state)) - - - def generate_language_model(gpt_2: ov.CompiledModel, inputs_embeds: torch.Tensor, attention_mask: torch.Tensor, max_new_tokens: int = 8, **_) -> torch.Tensor: - """ - Generates a sequence of hidden-states from the language model, conditioned on the embedding inputs. - """ - if not max_new_tokens: - max_new_tokens = 8 - inputs_embeds = inputs_embeds.cpu().numpy() - attention_mask = attention_mask.cpu().numpy() - for _ in range(max_new_tokens): - # forward pass to get next hidden states - output = gpt_2({"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}) - - next_hidden_states = output[0] - - # Update the model input - inputs_embeds = np.concatenate([inputs_embeds, next_hidden_states[:, -1:, :]], axis=1) - attention_mask = np.concatenate([attention_mask, np.ones((attention_mask.shape[0], 1))], axis=1) - return torch.from_numpy(inputs_embeds[:, -max_new_tokens:, :]) - -Now we initialize the wrapper objects and load them to the HF pipeline - -.. 
code:: ipython3 - - pipe = AudioLDM2Pipeline.from_pretrained(MODEL_ID) - pipe.config.torchscript = True - pipe.config.return_dict = False - - np.random.seed(0) - torch.manual_seed(0) - - pipe.text_encoder = OVClapEncoderWrapper(clap_text_encoder_ir_path, pipe.text_encoder.config) - pipe.text_encoder_2 = OVT5EncoderWrapper(t5_text_encoder_ir_path, pipe.text_encoder_2.config) - pipe.projection_model = OVProjectionModelWrapper(projection_model_ir_path, pipe.projection_model.config) - pipe.vocoder = OVVocoderWrapper(vocoder_ir_path, pipe.vocoder.config) - pipe.unet = OVUnetWrapper(unet_ir_path, pipe.unet.config) - pipe.vae = OVVaeDecoderWrapper(vae_ir_path, pipe.vae.config) - - pipe.generate_language_model = partial(generate_language_model, core.compile_model(language_model_ir_path, device.value)) - - gc.collect() - - prompt = "birds singing in the forest" - negative_prompt = "Low quality" - audio = pipe( - prompt, - negative_prompt=negative_prompt, - num_inference_steps=150, - audio_length_in_s=3.0, - ).audios[0] - - sampling_rate = 16000 - Audio(audio, rate=sampling_rate) - - - -.. parsed-literal:: - - Loading pipeline components...: 0%| | 0/11 [00:00 - - Your browser does not support the audio element. - - - - - -Try out the converted pipeline ------------------------------- - - - -Now, we are ready to start generation. For improving the generation -process, we also introduce an opportunity to provide a -``negative prompt``. Technically, positive prompt steers the diffusion -toward the output associated with it, while negative prompt steers the -diffusion away from it. The demo app below is created using `Gradio -package `__ - -.. code:: ipython3 - - import gradio as gr - - - def _generate( - prompt, - negative_prompt, - audio_length_in_s, - num_inference_steps, - _=gr.Progress(track_tqdm=True), - ): - """Gradio backing function.""" - audio_values = pipe( - prompt, - negative_prompt=negative_prompt, - num_inference_steps=num_inference_steps, - audio_length_in_s=audio_length_in_s, - ) - waveform = audio_values[0].squeeze() * 2**15 - return (sampling_rate, waveform.astype(np.int16)) - -.. code:: ipython3 - - import requests - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/sound-generation-audioldm2/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=_generate) - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(share=True, debug=False) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name="your server name", server_port="server port in int")` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ diff --git a/docs/notebooks/sparsity-optimization-with-output.rst b/docs/notebooks/sparsity-optimization-with-output.rst deleted file mode 100644 index 385429ac3c862c..00000000000000 --- a/docs/notebooks/sparsity-optimization-with-output.rst +++ /dev/null @@ -1,202 +0,0 @@ -Accelerate Inference of Sparse Transformer Models with OpenVINO™ and 4th Gen Intel® Xeon® Scalable Processors -============================================================================================================= - -This tutorial demonstrates how to improve performance of sparse -Transformer models with `OpenVINO `__ on 4th -Gen Intel® Xeon® Scalable processors. 
- -The tutorial downloads `a BERT-base -model `__ -which has been quantized, sparsified, and tuned for `SST2 -datasets `__ using -`Optimum-Intel `__. It -demonstrates the inference performance advantage on 4th Gen Intel® Xeon® -Scalable Processors by running it with `Sparse Weight -Decompression `__, -a runtime option that seizes model sparsity for efficiency. The notebook -consists of the following steps: - -- Install prerequisites -- Download and quantize sparse public BERT model, using the OpenVINO - integration with Hugging Face Optimum. -- Compare sparse 8-bit vs. dense 8-bit inference performance. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Imports <#imports>`__ - - - `Download, quantize and sparsify the model, using Hugging Face - Optimum - API <#download-quantize-and-sparsify-the-model-using-hugging-face-optimum-api>`__ - -- `Benchmark quantized dense inference - performance <#benchmark-quantized-dense-inference-performance>`__ -- `Benchmark quantized sparse inference - performance <#benchmark-quantized-sparse-inference-performance>`__ -- `When this might be helpful <#when-this-might-be-helpful>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "openvino>=2023.1.0" - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" "torch>=2.1" datasets "onnx<1.16.2" "transformers>=4.33.0" --extra-index-url https://download.pytorch.org/whl/cpu - -Imports -------- - - - -.. code:: ipython3 - - import requests - from pathlib import Path - - from optimum.intel.openvino import OVModelForSequenceClassification - from transformers import AutoTokenizer, pipeline - from huggingface_hub import hf_hub_download - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("sparsity-optimization.ipynb") - -Download, quantize and sparsify the model, using Hugging Face Optimum API -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The first step is to download a quantized sparse transformers which has -been translated to OpenVINO IR. Then, it will be put through a -classification as a simple validation of a working downloaded model. To -find out how the model is being quantized and sparsified, refer to the -`OpenVINO/bert-base-uncased-sst2-int8-unstructured80 `__ -model card on Hugging Face. - -.. 
code:: ipython3 - - import torch - - # The following model has been quantized, sparsified using Optimum-Intel 1.7 which is enabled by OpenVINO and NNCF - # for reproducibility, refer https://huggingface.co/OpenVINO/bert-base-uncased-sst2-int8-unstructured80 - model_id = "OpenVINO/bert-base-uncased-sst2-int8-unstructured80" - - # The following two steps will set up the model and download them to HF Cache folder - ov_model = OVModelForSequenceClassification.from_pretrained(model_id) - tokenizer = AutoTokenizer.from_pretrained(model_id) - - # Let's take the model for a spin! - sentiment_classifier = pipeline("text-classification", model=ov_model, tokenizer=tokenizer, device=torch.device("cpu")) - - text = "He's a dreadful magician." - outputs = sentiment_classifier(text) - - print(outputs) - -For benchmarking, we will use OpenVINO’s benchmark application and put -the IRs into a single folder. - -.. code:: ipython3 - - # create a folder - quantized_sparse_dir = Path("bert_80pc_sparse_quantized_ir") - quantized_sparse_dir.mkdir(parents=True, exist_ok=True) - - # following return path to specified filename in cache folder (which we've with the - ov_ir_xml_path = hf_hub_download(repo_id=model_id, filename="openvino_model.xml", local_dir=quantized_sparse_dir) - ov_ir_bin_path = hf_hub_download(repo_id=model_id, filename="openvino_model.bin", local_dir=quantized_sparse_dir) - -Benchmark quantized dense inference performance ------------------------------------------------ - - - -Benchmark dense inference performance using parallel execution on four -CPU cores to simulate a small instance in the cloud infrastructure. -Sequence length is dependent on use cases, 16 is common for -conversational AI while 160 for question answering task. It is set to 64 -as an example. It is recommended to tune based on your applications. - -.. code:: ipython3 - - # Dump benchmarking config for dense inference - with (quantized_sparse_dir / "perf_config.json").open("w") as outfile: - outfile.write( - """ - { - "CPU": {"NUM_STREAMS": 4, "INFERENCE_NUM_THREADS": 4} - } - """ - ) - -.. code:: ipython3 - - !benchmark_app -m $quantized_sparse_dir/openvino_model.xml -shape "input_ids[1,64],attention_mask[1,64],token_type_ids[1,64]" -load_config $quantized_sparse_dir/perf_config.json - -Benchmark quantized sparse inference performance ------------------------------------------------- - - - -To enable sparse weight decompression feature, users can add it to -runtime config like below. ``CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE`` -takes values between 0.5 and 1.0. It is a layer-level sparsity threshold -for which a layer will be enabled. - -.. code:: ipython3 - - # Dump benchmarking config for dense inference - # "CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE" controls minimum sparsity rate for weights to consider - # for sparse optimization at the runtime. - with (quantized_sparse_dir / "perf_config_sparse.json").open("w") as outfile: - outfile.write( - """ - { - "CPU": {"NUM_STREAMS": 4, "INFERENCE_NUM_THREADS": 4, "CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE": "0.75"} - } - """ - ) - -.. code:: ipython3 - - !benchmark_app -m $quantized_sparse_dir/openvino_model.xml -shape "input_ids[1,64],attention_mask[1,64],token_type_ids[1,64]" -load_config $quantized_sparse_dir/perf_config_sparse.json - -When this might be helpful --------------------------- - - - -This feature can improve inference performance for models with sparse -weights in the scenarios when the model is deployed to handle multiple -requests in parallel asynchronously. 
It is especially helpful with a -small sequence length, for example, 32 and lower. - -For more details about asynchronous inference with OpenVINO, refer to -the following documentation: - -- `Deployment Optimization - Guide `__ -- `Inference Request - API `__ diff --git a/docs/notebooks/speculative-sampling-with-output.rst b/docs/notebooks/speculative-sampling-with-output.rst deleted file mode 100644 index 779d5e0f265835..00000000000000 --- a/docs/notebooks/speculative-sampling-with-output.rst +++ /dev/null @@ -1,516 +0,0 @@ -Text Generation via Speculative Decoding using FastDraft and OpenVINO™ -====================================================================== - -As model sizes grow, Generative AI implementations require significant -inference resources. This not only increases the cost per generation -from a prompt, but also increases the power consumption used to serve -such requests. - -Inference optimizations for text generation are essential for reducing -costs and power consumption. When optimizing the inference process, the -amount of time and energy required to generate text can be significantly -reduced. This can lead to cost savings in terms of hardware and -software, as well as reduced power consumption. Additionally, inference -optimizations can help improve the accuracy of text generation as well -as the speed at which it can be generated. This can lead to an improved -user experience and increased efficiency in text-generation tasks. In -summary, inference optimizations for text generation are essential to -reduce costs and power consumption, while also improving the accuracy -and speed of text generation. - -Speculative decoding (or -`assisted-generation `__) -is a recent technique, that allows to speed up token generation when an -additional smaller draft model is used alongside with the main model. - -Speculative decoding works the following way. The draft model predicts -the next K tokens one by one in an autoregressive manner, while the main -model validates these predictions and corrects them if necessary. We go -through each predicted token, and if a difference is detected between -the draft and main model, we stop and keep the last token predicted by -the main model. Then the draft model gets the latest main prediction and -again tries to predict the next K tokens, repeating the cycle. - -This approach reduces the need for multiple infer requests to the main -model, enhancing performance. For instance, in more predictable parts of -text generation, the draft model can, in best-case scenarios, generate -the next K tokens that exactly match the target. In that case they are -validated in a single inference request to the main model (which is -bigger, more accurate but slower) instead of running K subsequent -requests. More details can be found in the original -`paper `__. - -|image0| - -Possibility to achieve significant speedup with Speculative Decoding is -highly depends on selection of a high-quality draft model that is both -efficient and well-aligned with the target. FastDraft is a novel and -efficient approach for pre-training and aligning a draft model to any -LLM to be used with speculative decoding, by incorporating efficient -pre-training followed by fine-tuning over synthetic datasets generated -by the target model. FastDraft was presented in the -`paper `__ at -`ENLSP@NeurIPS24 `__ -by Intel Labs. 
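-
-The verify-and-accept loop described above can be illustrated with a small,
-self-contained sketch. The ``draft_step`` and ``target_step`` functions below
-are hypothetical toy stand-ins for the draft and target models, not OpenVINO
-GenAI APIs; OpenVINO GenAI implements this loop internally, and a real target
-model verifies all K draft tokens in a single batched forward pass rather than
-one call per token:
-
-.. code:: ipython3
-
-    # A minimal sketch of greedy speculative decoding with toy models (illustration only).
-    VOCAB = 100
-
-
-    def target_step(tokens):
-        # toy "target" model: deterministic next token derived from the last token
-        return (tokens[-1] * 7 + 3) % VOCAB
-
-
-    def draft_step(tokens):
-        # toy "draft" model: agrees with the target except when the context length is a multiple of 8
-        next_token = (tokens[-1] * 7 + 3) % VOCAB
-        return next_token if len(tokens) % 8 else (next_token + 1) % VOCAB
-
-
-    def speculative_generate(prompt_tokens, max_new_tokens=30, k=5):
-        tokens, target_iterations = list(prompt_tokens), 0
-        while len(tokens) - len(prompt_tokens) < max_new_tokens:
-            # 1. The draft model proposes up to K candidate tokens autoregressively.
-            draft = []
-            for _ in range(k):
-                draft.append(draft_step(tokens + draft))
-            # 2. The target model verifies them: matches are accepted, the first
-            #    mismatch is replaced by the target's own token and drafting restarts.
-            target_iterations += 1
-            accepted = []
-            for candidate in draft:
-                expected = target_step(tokens + accepted)
-                accepted.append(expected)
-                if candidate != expected:
-                    break
-            tokens += accepted
-        print(f"Generated {len(tokens) - len(prompt_tokens)} tokens in {target_iterations} target iterations")
-        return tokens
-
-
-    speculative_generate([1])
-
-With a well-aligned draft model most candidates are accepted, so the number of
-target iterations stays well below the number of generated tokens; this is the
-effect that a high-quality draft model such as FastDraft aims to maximize.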
- -FastDraft pre-trained draft models achieve impressive results in key -metrics of acceptance rate, block efficiency and up to 3x memory bound -speed up when evaluated on code completion and up to 2x in -summarization, text completion and instruction tasks and unlock large -language models inference on AI-PC and other edge-devices. - -In this tutorial we consider how to apply Speculative decoding using -FastDraft and OpenVINO GenAI. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Prepare models <#prepare-models>`__ - - - `Select inference device <#select-inference-device>`__ - -- `Run target model without speculative - decoding <#run-target-model-without-speculative-decoding>`__ -- `Run Speculative decoding - pipeline <#run-speculative-decoding-pipeline>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. |image0| image:: https://github.com/user-attachments/assets/eb999dea-d98b-42bb-835e-28d3054e1a84 - -Prerequisites -------------- - - - -First, we should install the `OpenVINO -GenAI `__ for running -model inference. - -|image01| - -`OpenVINO™ GenAI `__ -is a library of the most popular Generative AI model pipelines, -optimized execution methods, and samples that run on top of highly -performant `OpenVINO -Runtime `__. - -This library is friendly to PC and laptop execution, and optimized for -resource consumption. It requires no external dependencies to run -generative models as it already includes all the core functionality -(e.g. tokenization via openvino-tokenizers). - -.. |image01| image:: https://media.githubusercontent.com/media/openvinotoolkit/openvino.genai/refs/heads/master/src/docs/openvino_genai.svg - -.. code:: ipython3 - - %pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets - -Prepare models --------------- - - - -As example, we will use already converted LLMs from `OpenVINO -collection `__. -You can find OpenVINO optimized FastDraft models can be found in this -`collection `__. -As example we will use -`Phi-3-mini-4k-instruct-int4-ov `__ -as target model and -`Phi-3-mini-FastDraft-50M-int8-ov `__ -as draft. - -In case, if you want run own models, you should convert them using -`Hugging Face -Optimum `__ -library accelerated by OpenVINO integration. More details about model -preparation can be found in `OpenVINO LLM inference -guide `__ - -.. code:: ipython3 - - from pathlib import Path - import huggingface_hub as hf_hub - - draft_model_id = "OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov" - target_model_id = "OpenVINO/Phi-3-mini-4k-instruct-int4-ov" - - draft_model_path = Path(draft_model_id.split("/")[-1]) - target_model_path = Path(target_model_id.split("/")[-1]) - - if not draft_model_path.exists(): - hf_hub.snapshot_download(draft_model_id, local_dir=draft_model_path) - if not target_model_path.exists(): - hf_hub.snapshot_download(target_model_id, local_dir=target_model_path) - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -Select the device from dropdown list for running inference using -OpenVINO. > **Note**: For achieving maximal performance, we recommend to -use GPU as target device if it is available. - -.. 
code:: ipython3 - - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import device_widget - - device = device_widget(default="CPU", exclude=["NPU", "AUTO"]) - - device - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("speculative-sampling.ipynb") - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'GPU'), value='CPU') - - - -Run target model without speculative decoding ---------------------------------------------- - - - -OpenVINO GenAI provides easy-to-use API for running text generation. -Firstly we will create pipeline with ``LLMPipeline``. ``LLMPipeline`` is -the main object used for decoding. You can construct it straight away -from the folder with the converted model. It will automatically load the -``main model``, ``tokenizer``, ``detokenizer`` and default -``generation configuration``. After that we will configure parameters -for decoding. Then we just run ``generate`` method and get the output in -text format. We do not need to encode input prompt according to model -expected template or write post-processing code for logits decoder, it -will be done easily with LLMPipeline. - -To obtain intermediate generation results without waiting until when -generation is finished, we will write streamer function. - -.. code:: ipython3 - - import openvino_genai as ov_genai - import time - - pipe = ov_genai.LLMPipeline(target_model_path, device.value) - - config = ov_genai.GenerationConfig() - config.max_new_tokens = 330 - prompt = ''' - - def prime_fib(n: int): - """ - prime_fib returns n-th number that is a Fibonacci number and it's also prime. - >>> prime_fib(1) - 2 - >>> prime_fib(2) - 3 - >>> prime_fib(3) - 5 - >>> prime_fib(4) - 13 - >>> prime_fib(5) - 89 - """''' - - - def streamer(subword): - print(subword, end="", flush=True) - # Return flag corresponds whether generation should be stopped. - # False means continue generation. - return False - - - start_time = time.perf_counter() - pipe.generate(prompt, config, streamer=streamer) - end_time = time.perf_counter() - -.. code:: ipython3 - - import gc - - print(f"Generation time: {end_time - start_time:.2f}s") - del pipe - gc.collect() - -Run Speculative decoding pipeline ---------------------------------- - - - -To enable Speculative decoding in ``LLMPipeline,`` we should -additionally provide the ``draft_model`` structure and -``SchedulerConfig`` for resource management. - -|image02| - -As shown in the figure above, speculative decoding works by splitting -the generative process into two stages. In the first stage, a fast, but -less accurate draft model (AKA assistant) autoregressively generates a -sequence of tokens. In the second stage, a large, but more accurate -target model conducts parallelized verification over the generated draft -tokens. This process allows the target model to produce multiple tokens -in a single forward pass and thus accelerate autoregressive decoding. -The success of speculative decoding largely hinges on the speculation -lookahead (SL), i.e. the number of tokens produced by the draft model in -each iteration. The straightforward method, based on `Leviathan et -al. 
`__, uses a static value of the
-speculation lookahead and involves generating a constant number of
-candidate tokens at each speculative iteration. You can adjust the
-number of candidates using the ``num_assistant_tokens`` parameter in
-the generation config.
-
-.. |image02| image:: https://github.com/user-attachments/assets/69f5c096-abca-4f97-952b-291c52eb3444
-
-.. code:: ipython3
-
-    scheduler_config = ov_genai.SchedulerConfig()
-    # cache params
-    scheduler_config.cache_size = 0
-    scheduler_config.num_kv_blocks = 2048 // 8
-    scheduler_config.max_num_batched_tokens = 2048
-
-    draft_model = ov_genai.draft_model(draft_model_path, device.value)
-
-    pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)
-
-    config = ov_genai.GenerationConfig()
-    config.max_new_tokens = 330
-    config.num_assistant_tokens = 5
-    start_time = time.perf_counter()
-    result = pipe.generate(prompt, config, streamer=streamer)
-    end_time = time.perf_counter()
-
-.. code:: ipython3
-
-    print(f"Generation time: {end_time - start_time:.2f}s")
-
-An alternative approach, Dynamic Speculative Decoding, described in the
-`paper `__, is based on heuristics and
-adjusts the number of candidate tokens for the next iteration based on
-the acceptance rate of the current iteration. If all speculative tokens
-are correct, the number of candidate tokens increases; otherwise, it
-decreases. To adjust the number of tokens, the
-``assistant_confidence_threshold`` parameter should be used. If the
-assistant model’s confidence in its prediction for the current token is
-lower than this threshold, the assistant model stops the current token
-generation iteration, even if the number of ``num_assistant_tokens`` is
-not yet reached. You can find more details in this `blog
-post `__.
-This approach is advantageous when the optimal number of tokens for the
-draft model is unknown or the draft model has a low acceptance rate.
-
-   *Note*: For small and fast draft models like FastDraft, you may not
-   see a benefit from dynamic speculative decoding.
-
-.. code:: ipython3
-
-    config = ov_genai.GenerationConfig()
-    config.max_new_tokens = 100
-    config.assistant_confidence_threshold = 0.05
-    start_time = time.perf_counter()
-    result = pipe.generate(["Sun is yellow because"], config, streamer)
-    end_time = time.perf_counter()
-
-.. code:: ipython3
-
-    print(f"Generation time: {end_time - start_time:.2f}s")
-
-Evaluate Speculative Decoding on multiple examples
---------------------------------------------------
-
-Configure the data type and the number of examples to run:
-
-.. code:: ipython3
-
-    num_samples_to_select = 50
-
-    import ipywidgets as widgets
-
-    data_options = ["Code", "Text"]
-    data_type = widgets.Dropdown(
-        options=data_options,
-        value=data_options[0],
-        description="Data type:",
-        disabled=False,
-    )
-    data_type
-
-
-
-
-.. parsed-literal::
-
-    Dropdown(description='Data type:', options=('Code', 'Text'), value='Code')
-
-
-
-Load the dataset and prepare the prompts:
-
-..
code:: ipython3 - - from datasets import load_dataset - - print("loading dataset...") - - if data_type.value == "Code": - ds = load_dataset("openai_humaneval", split="test") - prompts = ds["prompt"] - prompts = ["" + prompts[i] for i in range(num_samples_to_select)] - else: - ds = load_dataset("abisee/cnn_dailymail", "3.0.0", split="test") - prompts = ds["article"] - prompts = [ - "<|user|> ###\nArticle: " + prompts[i] + "\n\nSummarize the above article in 5 sentence.\n<|end|><|assistant|>" for i in range(num_samples_to_select) - ] - print("Done") - - -.. parsed-literal:: - - loading dataset... - Done - - -Run auto-regressive generation and get total runtime per example: - -.. code:: ipython3 - - import openvino_genai as ov_genai - import time - from tqdm import tqdm - - print("Running Auto-Regressive generation...") - pipe = ov_genai.LLMPipeline(target_model_path, device.value) - - config = ov_genai.GenerationConfig() - config.max_new_tokens = 330 - - times_auto_regressive = [] - for prompt in tqdm(prompts): - start_time = time.perf_counter() - result = pipe.generate(prompt, config) - end_time = time.perf_counter() - times_auto_regressive.append(end_time - start_time) - print("Done") - - import gc - - del pipe - gc.collect() - - -.. parsed-literal:: - - Running Auto-Regressive generation... - - -.. parsed-literal:: - - 100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:26<00:00, 5.32s/it] - -.. parsed-literal:: - - Done - - - - - - - - - -.. parsed-literal:: - - 9 - - - -Now run generation with speculative-decoding: - -.. code:: ipython3 - - scheduler_config = ov_genai.SchedulerConfig() - # cache params - scheduler_config.cache_size = 0 - scheduler_config.num_kv_blocks = 2048 // 8 - scheduler_config.max_num_batched_tokens = 2048 - - draft_model = ov_genai.draft_model(draft_model_path, device.value) - - pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config) - - config = ov_genai.GenerationConfig() - config.max_new_tokens = 330 - config.num_assistant_tokens = 5 - - - times_speculative_decoding = [] - print("Running Speculative Decoding generation...") - for prompt in tqdm(prompts): - start_time = time.perf_counter() - result = pipe.generate(prompt, config) - end_time = time.perf_counter() - times_speculative_decoding.append((end_time - start_time)) - print("Done") - - -.. parsed-literal:: - - Running Speculative Decoding generation... - - -.. parsed-literal:: - - 100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:52<00:00, 2.25s/it] - -.. parsed-literal:: - - Done - - - - - - - -Now let’s calculate the speedup: - -.. code:: ipython3 - - avg_speedup = sum([x / y for x, y in zip(times_auto_regressive, times_speculative_decoding)]) / len(prompts) - print(f"average speedup: {avg_speedup:.2f}") - - -.. 
parsed-literal:: - - average speedup: 2.23 - diff --git a/docs/notebooks/speech-recognition-quantization-wav2vec2-with-output.rst b/docs/notebooks/speech-recognition-quantization-wav2vec2-with-output.rst deleted file mode 100644 index 8a2b9a6e3f01aa..00000000000000 --- a/docs/notebooks/speech-recognition-quantization-wav2vec2-with-output.rst +++ /dev/null @@ -1,478 +0,0 @@ -Quantize Speech Recognition Models with accuracy control using NNCF PTQ API -=========================================================================== - -This tutorial demonstrates how to apply ``INT8`` quantization with -accuracy control to the speech recognition model, known as -`Wav2Vec2 `__, -using the NNCF (Neural Network Compression Framework) 8-bit quantization -with accuracy control in post-training mode (without the fine-tuning -pipeline). This notebook uses a fine-tuned -`Wav2Vec2-Base-960h `__ -`PyTorch `__ model trained on the `LibriSpeech ASR -corpus `__. The tutorial is designed to be -extendable to custom models and datasets. It consists of the following -steps: - -- Download and prepare the Wav2Vec2 model and LibriSpeech dataset. -- Define data loading and accuracy validation functionality. -- Model quantization with accuracy control. -- Compare Accuracy of original PyTorch model, OpenVINO FP16 and INT8 - models. -- Compare performance of the original and quantized models. - -The advanced quantization flow allows to apply 8-bit quantization to the -model with control of accuracy metric. This is achieved by keeping the -most impactful operations within the model in the original precision. -The flow is based on the `Basic 8-bit -quantization `__ -and has the following differences: - -- Besides the calibration dataset, a validation dataset is required to - compute the accuracy metric. Both datasets can refer to the same data - in the simplest case. -- Validation function, used to compute accuracy metric is required. It - can be a function that is already available in the source framework - or a custom function. -- Since accuracy validation is run several times during the - quantization process, quantization with accuracy control can take - more time than the Basic 8-bit quantization flow. -- The resulted model can provide smaller performance improvement than - the Basic 8-bit quantization flow because some of the operations are - kept in the original precision. - -.. - - **NOTE**: Currently, 8-bit quantization with accuracy control in NNCF - is available only for models in OpenVINO representation. - -The steps for the quantization with accuracy control are described -below. - - -**Table of contents:** - - -- `Imports <#imports>`__ -- `Prepare the Model <#prepare-the-model>`__ -- `Prepare LibriSpeech Dataset <#prepare-librispeech-dataset>`__ -- `Prepare calibration dataset <#prepare-calibration-dataset>`__ -- `Prepare validation function <#prepare-validation-function>`__ -- `Run quantization with accuracy - control <#run-quantization-with-accuracy-control>`__ -- `Model Usage Example <#model-usage-example>`__ -- `Compare Accuracy of the Original and Quantized - Models <#compare-accuracy-of-the-original-and-quantized-models>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. 
code:: ipython3 - - %pip install -q "openvino>=2023.1.0" - %pip install -q "nncf>=2.6.0" - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu soundfile librosa transformers "torch>=2.1" datasets torchmetrics - -Imports -------- - - - -.. code:: ipython3 - - import numpy as np - import torch - - from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("speech-recognition-quantization-wav2vec2.ipynb") - -Prepare the Model ------------------ - - - -For instantiating PyTorch model class, we should use -``Wav2Vec2ForCTC.from_pretrained`` method with providing model ID for -downloading from HuggingFace hub. Model weights and configuration files -will be downloaded automatically in first time usage. Keep in mind that -downloading the files can take several minutes and depends on your -internet connection. - -Additionally, we can create processor class which is responsible for -model specific pre- and post-processing steps. - -.. code:: ipython3 - - BATCH_SIZE = 1 - MAX_SEQ_LENGTH = 30480 - - - torch_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h", ctc_loss_reduction="mean") - processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h") - -Convert it to the OpenVINO Intermediate Representation (OpenVINO IR) - -.. code:: ipython3 - - import openvino as ov - - - default_input = torch.zeros([1, MAX_SEQ_LENGTH], dtype=torch.float) - ov_model = ov.convert_model(torch_model, example_input=default_input) - -Prepare LibriSpeech Dataset ---------------------------- - - - -For demonstration purposes, we will use short dummy version of -LibriSpeech dataset - ``patrickvonplaten/librispeech_asr_dummy`` to -speed up model evaluation. Model accuracy can be different from reported -in the paper. For reproducing original accuracy, use -``openslr/librispeech_asr`` dataset. - -.. code:: ipython3 - - from datasets import load_dataset - - - dataset = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) - test_sample = dataset[0]["audio"] - - - # define preprocessing function for converting audio to input values for model - def map_to_input(batch): - preprocessed_signal = processor( - batch["audio"]["array"], - return_tensors="pt", - padding="longest", - sampling_rate=batch["audio"]["sampling_rate"], - ) - input_values = preprocessed_signal.input_values - batch["input_values"] = input_values - return batch - - - # apply preprocessing function to dataset and remove audio column, to save memory as we do not need it anymore - dataset = dataset.map(map_to_input, batched=False, remove_columns=["audio"]) - - -.. 
parsed-literal:: - - Found cached dataset librispeech_asr_dummy (/home/ea/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr_dummy/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc) - Loading cached processed dataset at /home/ea/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr_dummy/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc/cache-dcb48242e67b91b1.arrow - - -Prepare calibration dataset ---------------------------- - - - -.. code:: ipython3 - - import nncf - - - def transform_fn(data_item): - """ - Extract the model's input from the data item. - The data item here is the data item that is returned from the data source per iteration. - This function should be passed when the data item cannot be used as model's input. - """ - return np.array(data_item["input_values"]) - - - calibration_dataset = nncf.Dataset(dataset, transform_fn) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -Prepare validation function ---------------------------- - - - -Define the validation function. - -.. code:: ipython3 - - from torchmetrics import WordErrorRate - from tqdm.notebook import tqdm - - - def validation_fn(model, dataset): - """ - Calculate and returns a metric for the model. - """ - wer = WordErrorRate() - for sample in dataset: - # run infer function on sample - output = model.output(0) - logits = model(np.array(sample["input_values"]))[output] - predicted_ids = np.argmax(logits, axis=-1) - transcription = processor.batch_decode(torch.from_numpy(predicted_ids)) - - # update metric on sample result - wer.update(transcription, [sample["text"]]) - - result = wer.compute() - - return 1 - result - -Run quantization with accuracy control --------------------------------------- - - - -You should provide the calibration dataset and the validation dataset. -It can be the same dataset. - -- parameter ``max_drop`` defines the - accuracy drop threshold. The quantization process stops when the - degradation of accuracy metric on the validation dataset is less than - the ``max_drop``. The default value is 0.01. NNCF will stop the - quantization and report an error if the ``max_drop`` value can’t be - reached. -- ``drop_type`` defines how the accuracy drop will be - calculated: ABSOLUTE (used by default) or RELATIVE. -- ``ranking_subset_size`` - size of a subset that is used to rank layers - by their contribution to the accuracy drop. Default value is 300, and - the more samples it has the better ranking, potentially. Here we use the - value 25 to speed up the execution. - - **NOTE**: Execution can take tens of minutes and requires up to 10 GB - of free memory - -.. code:: ipython3 - - from nncf.quantization.advanced_parameters import AdvancedAccuracyRestorerParameters - from nncf.parameters import ModelType - - quantized_model = nncf.quantize_with_accuracy_control( - ov_model, - calibration_dataset=calibration_dataset, - validation_dataset=calibration_dataset, - validation_fn=validation_fn, - max_drop=0.01, - drop_type=nncf.DropType.ABSOLUTE, - model_type=ModelType.TRANSFORMER, - advanced_accuracy_restorer_parameters=AdvancedAccuracyRestorerParameters(ranking_subset_size=25), - ) - - -.. 
parsed-literal:: - - Statistics collection: 24%|███████████████████████████████████▎ | 73/300 [00:12<00:37, 5.98it/s] - Applying Smooth Quant: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 41.01it/s] - - -.. parsed-literal:: - - INFO:nncf:36 ignored nodes was found by name in the NNCFGraph - - -.. parsed-literal:: - - Statistics collection: 24%|███████████████████████████████████▎ | 73/300 [00:22<01:08, 3.31it/s] - Applying Fast Bias correction: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 74/74 [00:23<00:00, 3.09it/s] - -.. parsed-literal:: - - INFO:nncf:Validation of initial model was started - - - - - - - -.. parsed-literal:: - - INFO:nncf:Elapsed Time: 00:00:00 - INFO:nncf:Elapsed Time: 00:00:11 - INFO:nncf:Metric of initial model: 0.9469565153121948 - INFO:nncf:Collecting values for each data item using the initial model - INFO:nncf:Elapsed Time: 00:00:09 - INFO:nncf:Validation of quantized model was started - INFO:nncf:Elapsed Time: 00:00:22 - INFO:nncf:Elapsed Time: 00:00:11 - INFO:nncf:Metric of quantized model: 0.5 - INFO:nncf:Collecting values for each data item using the quantized model - INFO:nncf:Elapsed Time: 00:00:06 - INFO:nncf:Accuracy drop: 0.4469565153121948 (DropType.ABSOLUTE) - INFO:nncf:Accuracy drop: 0.4469565153121948 (DropType.ABSOLUTE) - INFO:nncf:Total number of quantized operations in the model: 94 - INFO:nncf:Number of parallel processes to rank quantized operations: 14 - INFO:nncf:ORIGINAL metric is used to rank quantizers - INFO:nncf:Calculating ranking score for groups of quantizers - INFO:nncf:Elapsed Time: 00:04:36 - INFO:nncf:Changing the scope of quantizer nodes was started - INFO:nncf:Reverted 1 operations to the floating-point precision: - __module.wav2vec2.feature_extractor.conv_layers.2.conv/aten::_convolution/Convolution_11 - INFO:nncf:Accuracy drop with the new quantization scope is 0.06173914670944214 (DropType.ABSOLUTE) - INFO:nncf:Reverted 1 operations to the floating-point precision: - __module.wav2vec2.feature_extractor.conv_layers.1.conv/aten::_convolution/Convolution_10 - INFO:nncf:Accuracy drop with the new quantization scope is 0.010434746742248535 (DropType.ABSOLUTE) - INFO:nncf:Reverted 1 operations to the floating-point precision: - __module.wav2vec2.feature_extractor.conv_layers.3.conv/aten::_convolution/Convolution_12 - INFO:nncf:Algorithm completed: achieved required accuracy drop 0.006956517696380615 (DropType.ABSOLUTE) - INFO:nncf:3 out of 94 were reverted back to the floating-point precision: - __module.wav2vec2.feature_extractor.conv_layers.2.conv/aten::_convolution/Convolution_11 - __module.wav2vec2.feature_extractor.conv_layers.1.conv/aten::_convolution/Convolution_10 - __module.wav2vec2.feature_extractor.conv_layers.3.conv/aten::_convolution/Convolution_12 - - -Model Usage Example -------------------- - - - -.. code:: ipython3 - - import IPython.display as ipd - - - ipd.Audio(test_sample["array"], rate=16000) - - - - -.. raw:: html - - - - - - - -Select device for inference - -.. code:: ipython3 - - from notebook_utils import device_widget - - core = ov.Core() - - device = device_widget("CPU") - - device - -.. 
code:: ipython3 - - compiled_quantized_model = core.compile_model(model=quantized_model, device_name=device.value) - - input_data = np.expand_dims(test_sample["array"], axis=0) - -Next, make a prediction. - -.. code:: ipython3 - - predictions = compiled_quantized_model([input_data])[0] - predicted_ids = np.argmax(predictions, axis=-1) - transcription = processor.batch_decode(torch.from_numpy(predicted_ids)) - transcription - - - - -.. parsed-literal:: - - ['I E O WE WORD I O O FAGGI FARE E BO'] - - - -Compare Accuracy of the Original and Quantized Models ------------------------------------------------------ - - - -- Define dataloader for test dataset. -- Define functions to get inference for PyTorch and OpenVINO models. -- Define functions to compute Word Error Rate. - -.. code:: ipython3 - - # inference function for pytorch - def torch_infer(model, sample): - logits = model(torch.Tensor(sample["input_values"])).logits - # take argmax and decode - predicted_ids = torch.argmax(logits, dim=-1) - transcription = processor.batch_decode(predicted_ids) - return transcription - - - # inference function for openvino - def ov_infer(model, sample): - output = model.output(0) - logits = model(np.array(sample["input_values"]))[output] - predicted_ids = np.argmax(logits, axis=-1) - transcription = processor.batch_decode(torch.from_numpy(predicted_ids)) - return transcription - - - def compute_wer(dataset, model, infer_fn): - wer = WordErrorRate() - for sample in tqdm(dataset): - # run infer function on sample - transcription = infer_fn(model, sample) - # update metric on sample result - wer.update(transcription, [sample["text"]]) - # finalize metric calculation - result = wer.compute() - return result - -Now, compute WER for the original PyTorch model and quantized model. - -.. code:: ipython3 - - pt_result = compute_wer(dataset, torch_model, torch_infer) - quantized_result = compute_wer(dataset, compiled_quantized_model, ov_infer) - - print(f"[PyTorch] Word Error Rate: {pt_result:.4f}") - print(f"[Quantized OpenVino] Word Error Rate: {quantized_result:.4f}") - - - -.. parsed-literal:: - - 0%| | 0/73 [00:00`__ is an -open-source PyTorch toolkit that accelerates Conversational AI -development, i.e., the technology behind speech assistants, chatbots, -and large language models. - -Lear more in `GitHub -repo `__ and -`paper `__ - -This notebook tutorial demonstrates optimization and inference of -speechbrain emotion recognition model with OpenVINO. - - -**Table of contents:** - - -- `Installations <#installations>`__ -- `Imports <#imports>`__ -- `Prepare base model <#prepare-base-model>`__ -- `Initialize model <#initialize-model>`__ -- `PyTorch inference <#pytorch-inference>`__ -- `SpeechBrain model optimization with Intel - OpenVINO <#speechbrain-model-optimization-with-intel-openvino>`__ - - - `Step 1: Prepare input tensor <#step-1-prepare-input-tensor>`__ - - `Step 2: Convert model to OpenVINO - IR <#step-2-convert-model-to-openvino-ir>`__ - - `Step 3: OpenVINO model - inference <#step-3-openvino-model-inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Installations -~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - %pip install -q "speechbrain>=1.0.0" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q --upgrade --force-reinstall torch torchaudio --index-url https://download.pytorch.org/whl/cpu - %pip install -q "transformers>=4.30.0" "huggingface_hub>=0.8.0" "SoundFile" - %pip install -q "openvino>=2024.1.0" - -Imports -~~~~~~~ - - - -.. code:: ipython3 - - import torch - import torchaudio - from speechbrain.inference.interfaces import foreign_class - from huggingface_hub import hf_hub_download - - import openvino as ov - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("speechbrain-emotion-recognition.ipynb") - -Prepare base model -~~~~~~~~~~~~~~~~~~ - - - -The foreign_class function in SpeechBrain is a utility that allows you -to load and use custom PyTorch models within the SpeechBrain ecosystem. -It provides a convenient way to integrate external or custom-built -models into SpeechBrain’s inference pipeline without modifying the core -SpeechBrain codebase. - -1. source: This argument specifies the source or location of the - pre-trained model checkpoint. In this case, - “speechbrain/emotion-recognition-wav2vec2-IEMOCAP” refers to a - pre-trained model checkpoint available on the Hugging Face Hub. -2. pymodule_file: This argument is the path to a Python file containing - the definition of your custom PyTorch model class. In this example, - “custom_interface.py” is the name of the Python file that defines the - CustomEncoderWav2vec2Classifier class. -3. classname: This argument specifies the name of the custom PyTorch - model class defined in the pymodule_file. In this case, - “CustomEncoderWav2vec2Classifier” is the name of the class that - extends SpeechBrain’s Pretrained class and implements the necessary - methods for inference. - -.. code:: ipython3 - - classifier = foreign_class( - source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP", pymodule_file="custom_interface.py", classname="CustomEncoderWav2vec2Classifier" - ) - - -.. parsed-literal:: - - INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/emotion-recognition-wav2vec2-IEMOCAP' if not cached - INFO:speechbrain.utils.fetching:Fetch custom_interface.py: Fetching from HuggingFace Hub 'speechbrain/emotion-recognition-wav2vec2-IEMOCAP' if not cached - 2024-11-05 19:56:33.843411: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
- 2024-11-05 19:56:33.855243: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - WARNING: All log messages before absl::InitializeLog() is called are written to STDERR - E0000 00:00:1730822193.869707 99152 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - E0000 00:00:1730822193.873697 99152 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2024-11-05 19:56:33.888531: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - - - -.. parsed-literal:: - - config.json: 0%| | 0.00/1.84k [00:00`__ is an -open-source model optimized for generating short audio samples, sound -effects, and production elements using text prompts. The model was -trained on data from Freesound and the Free Music Archive, respecting -creator rights. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/76171391/ed4aa0f2-0501-4519-8b24-c1c3072b4ef2 - :alt: stable-audio - - stable-audio - -Key Takeaways: -^^^^^^^^^^^^^^ - -- Stable Audio Open is an open source text-to-audio model for - generating up to 47 seconds of samples and sound effects. - -- Users can create drum beats, instrument riffs, ambient sounds, foley - and production elements. - -- The model enables audio variations and style transfer of audio - samples. - -This model is made to be used with the -`stable-audio-tools `__ -library for inference. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Load the original model and - inference <#load-the-original-model-and-inference>`__ -- `Convert the model to OpenVINO - IR <#convert-the-model-to-openvino-ir>`__ - - - `T5-based text embedding <#t5-based-text-embedding>`__ - - `Transformer-based diffusion (DiT) - model <#transformer-based-diffusion-dit-model>`__ - - `Decoder part of autoencoder <#decoder-part-of-autoencoder>`__ - -- `Compiling models and inference <#compiling-models-and-inference>`__ -- `Interactive inference <#interactive-inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - import platform - - %pip install -q "torch>=2.2" "torchaudio" "einops" "einops-exts" "huggingface-hub" "k-diffusion" "pytorch_lightning" "alias-free-torch" "ema-pytorch" "transformers>=4.45" "gradio>=4.19" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q --no-deps "stable-audio-tools" - %pip install -q "nncf>=2.12.0" - if platform.system() == "Darwin": - %pip install -q "numpy>=1.26,<2.0.0" "pandas>2.0.2" "matplotlib>=3.9" - else: - %pip install -q "numpy>=1.26" "pandas>2.0.2" "matplotlib>=3.9" - %pip install -q "openvino>=2024.4.0" - -Load the original model and inference -------------------------------------- - - - - **Note**: run model with notebook, you will need to accept license - agreement. 
You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, - carefully read terms of usage and click accept button. You will need - to use an access token for the code below to run. For more - information on access tokens, refer to `this section of the - documentation `__. - You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: ipython3 - - # uncomment these lines to login to huggingfacehub to get access to pretrained model - # from huggingface_hub import notebook_login, whoami - - # try: - # whoami() - # print('Authorization token already provided') - # except OSError: - # notebook_login() - -.. code:: ipython3 - - import torch - import torchaudio - from einops import rearrange - from stable_audio_tools import get_pretrained_model - from stable_audio_tools.inference.generation import generate_diffusion_cond - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("stable-audio.ipynb") - - - # Download model - model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0") - - -.. parsed-literal:: - - No module named 'flash_attn' - flash_attn not installed, disabling Flash Attention - - -.. parsed-literal:: - - /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/myenv/lib/python3.11/site-packages/torch/nn/utils/weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. - WeightNorm.apply(module, name, dim) - - -.. code:: ipython3 - - sample_rate = model_config["sample_rate"] - - model = model.to("cpu") - total_seconds = 20 - - # Set up text and timing conditioning - conditioning = [{"prompt": "128 BPM tech house drum loop", "seconds_start": 0, "seconds_total": total_seconds}] - - # Generate stereo audio - output = generate_diffusion_cond( - model, - steps=100, - seed=42, - cfg_scale=7, - conditioning=conditioning, - sample_size=sample_rate * total_seconds, - sigma_min=0.3, - sigma_max=500, - sampler_type="dpmpp-3m-sde", - device="cpu", - ) - - # Rearrange audio batch to a single sequence - output = rearrange(output, "b d n -> d (b n)") - - # Peak normalize, clip, convert to int16, and save to file - output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu() - torchaudio.save("output.wav", output, sample_rate) - - -.. parsed-literal:: - - 42 - - -.. parsed-literal:: - - /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/myenv/lib/python3.11/site-packages/stable_audio_tools/models/conditioners.py:353: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. - with torch.cuda.amp.autocast(dtype=torch.float16) and torch.set_grad_enabled(self.enable_grad): - /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/myenv/lib/python3.11/site-packages/torch/amp/autocast_mode.py:266: UserWarning: User provided device_type of 'cuda', but CUDA is not available. Disabling - warnings.warn( - - - -.. 
parsed-literal:: - - 0%| | 0/100 [00:00=t0 but got ta=0.29999998211860657 and t0=0.3. - warnings.warn(f"Should have ta>=t0 but got ta={ta} and t0={self._start}.") - - -.. code:: ipython3 - - from IPython.display import Audio - - Audio("output.wav") - - - - -.. raw:: html - - - - - - - -Convert the model to OpenVINO IR --------------------------------- - - - -Let’s define the conversion function for PyTorch modules. We use -``ov.convert_model`` function to obtain OpenVINO Intermediate -Representation object and ``ov.save_model`` function to save it as XML -file. - -For reducing memory consumption, weights compression optimization can be -applied using `NNCF `__. Weight -compression aims to reduce the memory footprint of a model. models, -which require extensive memory to store the weights during inference, -can benefit from weight compression in the following ways: - -- enabling the inference of exceptionally large models that cannot be - accommodated in the memory of the device; - -- improving the inference performance of the models by reducing the - latency of the memory access when computing the operations with - weights, for example, Linear layers. - -`Neural Network Compression Framework -(NNCF) `__ provides 4-bit / -8-bit mixed weight quantization as a compression method. The main -difference between weights compression and full model quantization -(post-training quantization) is that activations remain floating-point -in the case of weights compression which leads to a better accuracy. In -addition, weight compression is data-free and does not require a -calibration dataset, making it easy to use. - -``nncf.compress_weights`` function can be used for performing weights -compression. The function accepts an OpenVINO model and other -compression parameters. Different parameters may be suitable for -different models. In this case default parameters give bad results. But -we can change mode to ``CompressWeightsMode.INT8_SYM`` to `compress -weights symmetrically to 8-bit integer data -type `__ -and get the inference results the same as original. - -More details about weights compression can be found in `OpenVINO -documentation `__. - -.. code:: ipython3 - - from pathlib import Path - import torch - from nncf import compress_weights, CompressWeightsMode - import openvino as ov - - - def convert(model: torch.nn.Module, xml_path: str, example_input): - xml_path = Path(xml_path) - if not xml_path.exists(): - xml_path.parent.mkdir(parents=True, exist_ok=True) - model.eval() - with torch.no_grad(): - converted_model = ov.convert_model(model, example_input=example_input) - converted_model = compress_weights(converted_model, mode=CompressWeightsMode.INT8_SYM) - ov.save_model(converted_model, xml_path) - - # cleanup memory - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, openvino - - -.. code:: ipython3 - - MODEL_DIR = Path("model") - - CONDITIONER_ENCODER_PATH = MODEL_DIR / "conditioner_encoder.xml" - DIFFUSION_PATH = MODEL_DIR / "diffusion.xml" - PRETRANSFORM_PATH = MODEL_DIR / "pretransform.xml" - -The pipeline comprises three components: an autoencoder that compresses -waveforms into a manageable sequence length, a T5-based text embedding -for text conditioning, and a transformer-based diffusion (DiT) model -that operates in the latent space of the autoencoder. 
In this example an -initial audio is not used, so we need to convert T5-based text embedding -model, transformer-based diffusion (DiT) model and only decoder part of -autoencoder. - -T5-based text embedding -~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - example_input = { - "input_ids": torch.zeros(1, 120, dtype=torch.int64), - "attention_mask": torch.zeros(1, 120, dtype=torch.int64), - } - - convert(model.conditioner.conditioners["prompt"].model, CONDITIONER_ENCODER_PATH, example_input) - -Transformer-based diffusion (DiT) model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - import types - - - def continious_transformer_forward(self, x, mask=None, prepend_embeds=None, prepend_mask=None, global_cond=None, return_info=False, **kwargs): - batch, seq, device = *x.shape[:2], x.device - - info = { - "hidden_states": [], - } - - x = self.project_in(x) - - if prepend_embeds is not None: - prepend_length, prepend_dim = prepend_embeds.shape[1:] - - assert prepend_dim == x.shape[-1], "prepend dimension must match sequence dimension" - - x = torch.cat((prepend_embeds, x), dim=-2) - - if prepend_mask is not None or mask is not None: - mask = mask if mask is not None else torch.ones((batch, seq), device=device, dtype=torch.bool) - prepend_mask = prepend_mask if prepend_mask is not None else torch.ones((batch, prepend_length), device=device, dtype=torch.bool) - - mask = torch.cat((prepend_mask, mask), dim=-1) - - # Attention layers - - if self.rotary_pos_emb is not None: - rotary_pos_emb = self.rotary_pos_emb.forward_from_seq_len(x.shape[1]) - else: - rotary_pos_emb = None - - if self.use_sinusoidal_emb or self.use_abs_pos_emb: - x = x + self.pos_emb(x) - - # Iterate over the transformer layers - for layer in self.layers: - x = layer(x, rotary_pos_emb=rotary_pos_emb, global_cond=global_cond, **kwargs) - if return_info: - info["hidden_states"].append(x) - - x = self.project_out(x) - - if return_info: - return x, info - - return x - - - class DiffusionWrapper(torch.nn.Module): - def __init__(self, diffusion): - super().__init__() - self.diffusion = diffusion - - def forward(self, x=None, t=None, cross_attn_cond=None, cross_attn_cond_mask=None, global_embed=None): - model_inputs = {"cross_attn_cond": cross_attn_cond, "cross_attn_cond_mask": cross_attn_cond_mask, "global_embed": global_embed} - - return self.diffusion.forward(x, t, cfg_scale=7, **model_inputs) - - - example_input = { - "x": torch.rand([1, 64, 1024], dtype=torch.float32), - "t": torch.rand([1], dtype=torch.float32), - "cross_attn_cond": torch.rand([1, 130, 768], dtype=torch.float32), - "cross_attn_cond_mask": torch.ones([1, 130], dtype=torch.float32), - "global_embed": torch.rand(torch.Size([1, 1536]), dtype=torch.float32), - } - - diffuser = model.model.model - diffuser.transformer.forward = types.MethodType(continious_transformer_forward, diffuser.transformer) - convert(DiffusionWrapper(diffuser), DIFFUSION_PATH, example_input) - -Decoder part of autoencoder -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - import types - - - def residual_forward(self, x): - res = x - x = self.layers(x) - return x + res - - - for layer in model.pretransform.model.decoder.layers: - if layer.__class__.__name__ == "DecoderBlock": - for sublayer in layer.layers: - if sublayer.__class__.__name__ == "ResidualUnit": - sublayer.forward = types.MethodType(residual_forward, sublayer) - - convert(model.pretransform.model.decoder, PRETRANSFORM_PATH, torch.rand([1, 64, 215], dtype=torch.float32)) - -Compiling models and inference ------------------------------- - - - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget("CPU") - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -Let’s create callable wrapper classes for compiled models to allow -interaction with original pipeline. Note that all of wrapper classes -return ``torch.Tensor``\ s instead of ``np.array``\ s. - -.. code:: ipython3 - - core = ov.Core() - - - class TextEncoderWrapper(torch.nn.Module): - def __init__(self, text_encoder, dtype, device="CPU"): - super().__init__() - self.text_encoder = core.compile_model(text_encoder, device) - self.dtype = dtype - - def __call__(self, input_ids=None, attention_mask=None): - inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - last_hidden_state = self.text_encoder(inputs)[0] - - return {"last_hidden_state": torch.from_numpy(last_hidden_state)} - -.. code:: ipython3 - - class OVWrapper(torch.nn.Module): - def __init__(self, ov_model, old_model, device="CPU") -> None: - super().__init__() - self.mock = torch.nn.Parameter(torch.zeros(1)) # this is only mock to not change the pipeline - self.dif_transformer = core.compile_model(ov_model, device) - - def forward(self, x=None, t=None, cross_attn_cond=None, cross_attn_cond_mask=None, global_embed=None, **kwargs): - inputs = { - "x": x, - "t": t, - "cross_attn_cond": cross_attn_cond, - "cross_attn_cond_mask": cross_attn_cond_mask, - "global_embed": global_embed, - } - result = self.dif_transformer(inputs) - - return torch.from_numpy(result[0]) - -.. code:: ipython3 - - class PretransformDecoderWrapper(torch.nn.Module): - def __init__(self, ov_model, device="CPU"): - super().__init__() - self.decoder = core.compile_model(ov_model, device) - - def forward(self, latents=None): - result = self.decoder(latents) - - return torch.from_numpy(result[0]) - -Now we can replace the original models by our wrapped OpenVINO models -and run inference. - -.. code:: ipython3 - - model.model.model = OVWrapper(DIFFUSION_PATH, model.model.model, device.value) - model.conditioner.conditioners["prompt"].model = TextEncoderWrapper( - CONDITIONER_ENCODER_PATH, model.conditioner.conditioners["prompt"].model.dtype, device.value - ) - model.pretransform.model.decoder = PretransformDecoderWrapper(PRETRANSFORM_PATH, device.value) - -.. 
code:: ipython3 - - output = generate_diffusion_cond( - model, - steps=100, - seed=42, - cfg_scale=7, - conditioning=conditioning, - sample_size=sample_rate * total_seconds, - sigma_min=0.3, - sigma_max=500, - sampler_type="dpmpp-3m-sde", - device="cpu", - ) - - # Rearrange audio batch to a single sequence - output = rearrange(output, "b d n -> d (b n)") - - # Peak normalize, clip, convert to int16, and save to file - output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu() - torchaudio.save("output.wav", output, sample_rate) - - -.. parsed-literal:: - - 42 - - - -.. parsed-literal:: - - 0%| | 0/100 [00:00 - - Your browser does not support the audio element. - - - - - -Interactive inference ---------------------- - - - -.. code:: ipython3 - - def _generate(prompt, total_seconds, steps, seed): - sample_rate = model_config["sample_rate"] - - # Set up text and timing conditioning - conditioning = [{"prompt": prompt, "seconds_start": 0, "seconds_total": total_seconds}] - - output = generate_diffusion_cond( - model, - steps=steps, - seed=seed, - cfg_scale=7, - conditioning=conditioning, - sample_size=sample_rate * total_seconds, - sigma_min=0.3, - sigma_max=500, - sampler_type="dpmpp-3m-sde", - device="cpu", - ) - - # Rearrange audio batch to a single sequence - output = rearrange(output, "b d n -> d (b n)") - - # Peak normalize, clip, convert to int16, and save to file - output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu() - return (sample_rate, output.numpy().transpose()) - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-audio/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=_generate) - - try: - demo.launch(debug=True) - except Exception: - demo.launch(share=True, debug=True) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ diff --git a/docs/notebooks/stable-cascade-image-generation-with-output.rst b/docs/notebooks/stable-cascade-image-generation-with-output.rst deleted file mode 100644 index 02d80d97bd0940..00000000000000 --- a/docs/notebooks/stable-cascade-image-generation-with-output.rst +++ /dev/null @@ -1,551 +0,0 @@ -Image generation with Stable Cascade and OpenVINO -================================================= - -`Stable Cascade `__ -is built upon the -`Würstchen `__ architecture -and its main difference to other models like Stable Diffusion is that it -is working at a much smaller latent space. Why is this important? The -smaller the latent space, the faster you can run inference and the -cheaper the training becomes. How small is the latent space? Stable -Diffusion uses a compression factor of 8, resulting in a 1024x1024 image -being encoded to 128x128. Stable Cascade achieves a compression factor -of 42, meaning that it is possible to encode a 1024x1024 image to 24x24, -while maintaining crisp reconstructions. The text-conditional model is -then trained in the highly compressed latent space. 
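As a quick, purely illustrative sanity check of the compression factors quoted above (this snippet is not part of the original notebook), the latent resolutions follow directly from the stated factors:

.. code:: python

   image_size = 1024

   # Stable Diffusion: spatial compression factor of 8
   sd_latent = image_size // 8              # 128 -> a 128x128 latent

   # Stable Cascade: overall compression factor of ~42
   cascade_latent = round(image_size / 42)  # ~24 -> a 24x24 latent

   print(f"Stable Diffusion latent: {sd_latent}x{sd_latent}")
   print(f"Stable Cascade latent:   {cascade_latent}x{cascade_latent}")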
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Load the original model <#load-the-original-model>`__ - - - `Infer the original model <#infer-the-original-model>`__ - -- `Convert the model to OpenVINO - IR <#convert-the-model-to-openvino-ir>`__ - - - `Prior pipeline <#prior-pipeline>`__ - - `Decoder pipeline <#decoder-pipeline>`__ - -- `Select inference device <#select-inference-device>`__ -- `Building the pipeline <#building-the-pipeline>`__ -- `Inference <#inference>`__ -- `Interactive inference <#interactive-inference>`__ - - - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "diffusers>=0.27.0" accelerate datasets gradio transformers "nncf>=2.10.0" "protobuf>=3.20" "openvino>=2024.1.0" "torch>=2.1" --extra-index-url https://download.pytorch.org/whl/cpu - -Load and run the original pipeline ----------------------------------- - - - -.. code:: ipython3 - - import torch - from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline - import requests - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("stable-cascade-image-generation.ipynb") - - prompt = "an image of a shiba inu, donning a spacesuit and helmet" - negative_prompt = "" - - prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", torch_dtype=torch.float32) - decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", torch_dtype=torch.float32) - - - -.. parsed-literal:: - - Loading pipeline components...: 0%| | 0/6 [00:00`__ -to 8-bit to reduce model size. - -.. code:: ipython3 - - import gc - from pathlib import Path - - import openvino as ov - import nncf - - - MODELS_DIR = Path("models") - - - def convert(model: torch.nn.Module, xml_path: str, example_input, input_shape=None): - xml_path = Path(xml_path) - if not xml_path.exists(): - model.eval() - xml_path.parent.mkdir(parents=True, exist_ok=True) - with torch.no_grad(): - if not input_shape: - converted_model = ov.convert_model(model, example_input=example_input) - else: - converted_model = ov.convert_model(model, example_input=example_input, input=input_shape) - converted_model = nncf.compress_weights(converted_model) - ov.save_model(converted_model, xml_path) - del converted_model - - # cleanup memory - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - - gc.collect() - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, openvino - - -Prior pipeline -~~~~~~~~~~~~~~ - - - -This pipeline consists of text encoder and prior diffusion model. From -here, we always use fixed shapes in conversion by using an -``input_shape`` parameter to generate a less memory-demanding model. - -.. 
code:: ipython3 - - PRIOR_TEXT_ENCODER_OV_PATH = MODELS_DIR / "prior_text_encoder_model.xml" - - prior.text_encoder.config.output_hidden_states = True - - - class TextEncoderWrapper(torch.nn.Module): - def __init__(self, text_encoder): - super().__init__() - self.text_encoder = text_encoder - - def forward(self, input_ids, attention_mask): - outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True) - return outputs["text_embeds"], outputs["last_hidden_state"], outputs["hidden_states"] - - - convert( - TextEncoderWrapper(prior.text_encoder), - PRIOR_TEXT_ENCODER_OV_PATH, - example_input={ - "input_ids": torch.zeros(1, 77, dtype=torch.int32), - "attention_mask": torch.zeros(1, 77), - }, - input_shape={"input_ids": ((1, 77),), "attention_mask": ((1, 77),)}, - ) - del prior.text_encoder - gc.collect(); - -.. code:: ipython3 - - PRIOR_PRIOR_MODEL_OV_PATH = MODELS_DIR / "prior_prior_model.xml" - - convert( - prior.prior, - PRIOR_PRIOR_MODEL_OV_PATH, - example_input={ - "sample": torch.zeros(2, 16, 24, 24), - "timestep_ratio": torch.ones(2), - "clip_text_pooled": torch.zeros(2, 1, 1280), - "clip_text": torch.zeros(2, 77, 1280), - "clip_img": torch.zeros(2, 1, 768), - }, - input_shape=[((-1, 16, 24, 24),), ((-1),), ((-1, 1, 1280),), ((-1, 77, 1280),), (-1, 1, 768)], - ) - del prior.prior - gc.collect(); - -Decoder pipeline -~~~~~~~~~~~~~~~~ - - - -Decoder pipeline consists of 3 parts: decoder, text encoder and VQGAN. - -.. code:: ipython3 - - DECODER_TEXT_ENCODER_MODEL_OV_PATH = MODELS_DIR / "decoder_text_encoder_model.xml" - - convert( - TextEncoderWrapper(decoder.text_encoder), - DECODER_TEXT_ENCODER_MODEL_OV_PATH, - example_input={ - "input_ids": torch.zeros(1, 77, dtype=torch.int32), - "attention_mask": torch.zeros(1, 77), - }, - input_shape={"input_ids": ((1, 77),), "attention_mask": ((1, 77),)}, - ) - - del decoder.text_encoder - gc.collect(); - -.. code:: ipython3 - - DECODER_DECODER_MODEL_OV_PATH = MODELS_DIR / "decoder_decoder_model.xml" - - convert( - decoder.decoder, - DECODER_DECODER_MODEL_OV_PATH, - example_input={ - "sample": torch.zeros(1, 4, 256, 256), - "timestep_ratio": torch.ones(1), - "clip_text_pooled": torch.zeros(1, 1, 1280), - "effnet": torch.zeros(1, 16, 24, 24), - }, - input_shape=[((-1, 4, 256, 256),), ((-1),), ((-1, 1, 1280),), ((-1, 16, 24, 24),)], - ) - del decoder.decoder - gc.collect(); - -.. code:: ipython3 - - VQGAN_PATH = MODELS_DIR / "vqgan_model.xml" - - - class VqganDecoderWrapper(torch.nn.Module): - def __init__(self, vqgan): - super().__init__() - self.vqgan = vqgan - - def forward(self, h): - return self.vqgan.decode(h) - - - convert( - VqganDecoderWrapper(decoder.vqgan), - VQGAN_PATH, - example_input=torch.zeros(1, 4, 256, 256), - input_shape=(1, 4, 256, 256), - ) - del decoder.vqgan - gc.collect(); - -Select inference device ------------------------ - - - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=4, options=('CPU', 'GPU.0', 'GPU.1', 'GPU.2', 'AUTO'), value='AUTO') - - - -Building the pipeline ---------------------- - - - -Let’s create callable wrapper classes for compiled models to allow -interaction with original pipelines. Note that all of wrapper classes -return ``torch.Tensor``\ s instead of ``np.array``\ s. - -.. 
code:: ipython3 - - from collections import namedtuple - - core = ov.Core() - - - BaseModelOutputWithPooling = namedtuple("BaseModelOutputWithPooling", ["text_embeds", "last_hidden_state", "hidden_states"]) - - - class TextEncoderWrapper: - dtype = torch.float32 # accessed in the original workflow - - def __init__(self, text_encoder_path, device): - self.text_encoder = core.compile_model(text_encoder_path, device.value) - - def __call__(self, input_ids, attention_mask, output_hidden_states=True): - output = self.text_encoder({"input_ids": input_ids, "attention_mask": attention_mask}) - text_embeds = output[0] - last_hidden_state = output[1] - hidden_states = list(output.values())[1:] - return BaseModelOutputWithPooling(torch.from_numpy(text_embeds), torch.from_numpy(last_hidden_state), [torch.from_numpy(hs) for hs in hidden_states]) - -.. code:: ipython3 - - class PriorPriorWrapper: - def __init__(self, prior_path, device): - self.prior = core.compile_model(prior_path, device.value) - self.config = namedtuple("PriorWrapperConfig", ["clip_image_in_channels", "in_channels"])(768, 16) # accessed in the original workflow - self.parameters = lambda: (torch.zeros(i, dtype=torch.float32) for i in range(1)) # accessed in the original workflow - - def __call__(self, sample, timestep_ratio, clip_text_pooled, clip_text=None, clip_img=None, **kwargs): - inputs = { - "sample": sample, - "timestep_ratio": timestep_ratio, - "clip_text_pooled": clip_text_pooled, - "clip_text": clip_text, - "clip_img": clip_img, - } - output = self.prior(inputs) - return [torch.from_numpy(output[0])] - -.. code:: ipython3 - - class DecoderWrapper: - dtype = torch.float32 # accessed in the original workflow - - def __init__(self, decoder_path, device): - self.decoder = core.compile_model(decoder_path, device.value) - - def __call__(self, sample, timestep_ratio, clip_text_pooled, effnet, **kwargs): - inputs = {"sample": sample, "timestep_ratio": timestep_ratio, "clip_text_pooled": clip_text_pooled, "effnet": effnet} - output = self.decoder(inputs) - return [torch.from_numpy(output[0])] - -.. code:: ipython3 - - VqganOutput = namedtuple("VqganOutput", "sample") - - - class VqganWrapper: - config = namedtuple("VqganWrapperConfig", "scale_factor")(0.3764) # accessed in the original workflow - - def __init__(self, vqgan_path, device): - self.vqgan = core.compile_model(vqgan_path, device.value) - - def decode(self, h): - output = self.vqgan(h)[0] - output = torch.tensor(output) - return VqganOutput(output) - -And insert wrappers instances in the pipeline: - -.. code:: ipython3 - - prior.text_encoder = TextEncoderWrapper(PRIOR_TEXT_ENCODER_OV_PATH, device) - prior.prior = PriorPriorWrapper(PRIOR_PRIOR_MODEL_OV_PATH, device) - decoder.decoder = DecoderWrapper(DECODER_DECODER_MODEL_OV_PATH, device) - decoder.text_encoder = TextEncoderWrapper(DECODER_TEXT_ENCODER_MODEL_OV_PATH, device) - decoder.vqgan = VqganWrapper(VQGAN_PATH, device) - -Inference ---------- - - - -.. code:: ipython3 - - prior_output = prior( - prompt=prompt, - height=1024, - width=1024, - negative_prompt=negative_prompt, - guidance_scale=4.0, - num_images_per_prompt=1, - num_inference_steps=20, - ) - - decoder_output = decoder( - image_embeddings=prior_output.image_embeddings, - prompt=prompt, - negative_prompt=negative_prompt, - guidance_scale=0.0, - output_type="pil", - num_inference_steps=10, - ).images[0] - display(decoder_output) - - - -.. 
parsed-literal:: - - 0%| | 0/20 [00:00`__ is an effective and -lightweight adapter that adds image prompting capabilities to a -diffusion model. This adapter works by decoupling the cross-attention -layers of the image and text features. All the other model components -are frozen and only the embedded image features in the UNet are trained. -As a result, IP-Adapter files are typically only ~100MBs. -|ip-adapter-pipe.png| - -In this tutorial, we will consider how to convert and run Stable -Diffusion pipeline with loading IP-Adapter. We will use -`stable-diffusion-v1.5 `__ -as base model and apply official -`IP-Adapter `__ weights. Also for -speedup generation process we will use -`LCM-LoRA `__ - -.. |ip-adapter-pipe.png| image:: https://huggingface.co/h94/IP-Adapter/resolve/main/fig1.png - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Prepare Diffusers pipeline <#prepare-diffusers-pipeline>`__ -- `Convert PyTorch models <#convert-pytorch-models>`__ - - - `Image Encoder <#image-encoder>`__ - - `U-net <#u-net>`__ - - `VAE Encoder and Decoder <#vae-encoder-and-decoder>`__ - - `Text Encoder <#text-encoder>`__ - -- `Prepare OpenVINO inference - pipeline <#prepare-openvino-inference-pipeline>`__ -- `Run model inference <#run-model-inference>`__ - - - `Select inference device <#select-inference-device>`__ - - `Generation image variation <#generation-image-variation>`__ - - `Generation conditioned by image and - text <#generation-conditioned-by-image-and-text>`__ - - `Generation image blending <#generation-image-blending>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "torch>=2.1" transformers accelerate "diffusers>=0.24.0" "openvino>=2023.3.0" "gradio>=4.19" opencv-python "peft>=0.6.2" "protobuf>=3.20" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "matplotlib>=3.4" - -Prepare Diffusers pipeline --------------------------- - - - -First of all, we should collect all components of our pipeline together. -To work with Stable Diffusion, we will use HuggingFace -`Diffusers `__ library. To -experiment with Stable Diffusion models, Diffusers exposes the -`StableDiffusionPipeline `__ -similar to the `other Diffusers -pipelines `__. -Additionally, the pipeline supports load adapters that extend Stable -Diffusion functionality such as `Low-Rank Adaptation -(LoRA) `__, -`PEFT `__, -`IP-Adapter `__, and `Textual -Inversion `__. You can find more -information about supported adapters in `diffusers -documentation `__. - -In this tutorial, we will focus on ip-adapter. IP-Adapter can be -integrated into diffusion pipeline using ``load_ip_adapter`` method. -IP-Adapter allows you to use both image and text to condition the image -generation process. For adjusting the text prompt and image prompt -condition ratio, we can use ``set_ip_adapter_scale()`` method. If you -only use the image prompt, you should set the scale to 1.0. You can -lower the scale to get more generation diversity, but it’ll be less -aligned with the prompt. scale=0.5 can achieve good results when you use -both text and image prompts. - -As discussed before, we will also use LCM LoRA for speeding generation -process. 
You can find more information about LCM LoRA in this -`notebook `__. -For applying LCM LoRA, we should use ``load_lora_weights`` method. -Additionally, LCM requires using LCMScheduler for efficient generation. - -.. code:: ipython3 - - import requests - from pathlib import Path - from diffusers import AutoPipelineForText2Image - from transformers import CLIPVisionModelWithProjection - from diffusers.utils import load_image - from diffusers import LCMScheduler - - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("stable-diffusion-ip-adapter.ipynb") - - - stable_diffusion_id = "botp/stable-diffusion-v1-5" - ip_adapter_id = "h94/IP-Adapter" - ip_adapter_weight_name = "ip-adapter_sd15.bin" - lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5" - models_dir = Path("model") - - load_original_pipeline = not all( - [ - (models_dir / model_name).exists() - for model_name in [ - "text_encoder.xml", - "image_encoder.xml", - "unet.xml", - "vae_decoder.xml", - "vae_encoder.xml", - ] - ] - ) - - - def get_pipeline_components( - stable_diffusion_id, - ip_adapter_id, - ip_adapter_weight_name, - lcm_lora_id, - ip_adapter_scale=0.6, - ): - image_encoder = CLIPVisionModelWithProjection.from_pretrained("h94/IP-Adapter", subfolder="models/image_encoder") - pipeline = AutoPipelineForText2Image.from_pretrained(stable_diffusion_id, image_encoder=image_encoder) - pipeline.load_lora_weights(lcm_lora_id) - pipeline.fuse_lora() - pipeline.load_ip_adapter(ip_adapter_id, subfolder="models", weight_name=ip_adapter_weight_name) - pipeline.set_ip_adapter_scale(0.6) - scheduler = LCMScheduler.from_pretrained(stable_diffusion_id, subfolder="scheduler") - return ( - pipeline.tokenizer, - pipeline.feature_extractor, - scheduler, - pipeline.text_encoder, - pipeline.image_encoder, - pipeline.unet, - pipeline.vae, - ) - - - if load_original_pipeline: - ( - tokenizer, - feature_extractor, - scheduler, - text_encoder, - image_encoder, - unet, - vae, - ) = get_pipeline_components(stable_diffusion_id, ip_adapter_id, ip_adapter_weight_name, lcm_lora_id) - scheduler.save_pretrained(models_dir / "scheduler") - else: - tokenizer, feature_extractor, scheduler, text_encoder, image_encoder, unet, vae = ( - None, - None, - None, - None, - None, - None, - None, - ) - - - -.. parsed-literal:: - - Loading pipeline components...: 0%| | 0/7 [00:00`__ -is used as Image Encoder. For preprocessing input image, Image Encoder -uses ``CLIPImageProcessor`` named feature extractor in pipeline. The -image encoder accept resized and normalized image processed by feature -extractor as input and returns image embeddings. - -.. 
code:: ipython3 - - import openvino as ov - import torch - import gc - - - def cleanup_torchscript_cache(): - """ - Helper for removing cached model representation - """ - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - - - IMAGE_ENCODER_PATH = models_dir / "image_encoder.xml" - - if not IMAGE_ENCODER_PATH.exists(): - with torch.no_grad(): - ov_model = ov.convert_model( - image_encoder, - example_input=torch.zeros((1, 3, 224, 224)), - input=[-1, 3, 224, 224], - ) - ov.save_model(ov_model, IMAGE_ENCODER_PATH) - feature_extractor.save_pretrained(models_dir / "feature_extractor") - del ov_model - cleanup_torchscript_cache() - - del image_encoder - del feature_extractor - - gc.collect(); - -U-net -~~~~~ - - - -U-Net model gradually denoises latent image representation guided by -text encoder hidden state. - -Generally, U-Net model conversion process remain the same like in Stable -Diffusion, expect additional input that accept image embeddings -generated by Image Encoder. In Stable Diffusion pipeline, this data -provided into model using dictionary ``added_cond_kwargs`` and key -``image_embeds`` inside it. After OpenVINO conversion, this input will -be decomposed from dictionary. In some cases, such decomposition may -lead to loosing information about input shape and data type. We can -restore it manually as demonstrated in the code bellow. - -U-Net model inputs: - -- ``sample`` - latent image sample from previous step. Generation - process has not been started yet, so you will use random noise. -- ``timestep`` - current scheduler step. -- ``encoder_hidden_state`` - hidden state of text encoder. -- ``image_embeds`` - hidden state of image encoder. - -Model predicts the ``sample`` state for the next step. - -.. code:: ipython3 - - UNET_PATH = models_dir / "unet.xml" - - - if not UNET_PATH.exists(): - inputs = { - "sample": torch.randn((2, 4, 64, 64)), - "timestep": torch.tensor(1), - "encoder_hidden_states": torch.randn((2, 77, 768)), - "added_cond_kwargs": {"image_embeds": torch.ones((2, 1024))}, - } - - with torch.no_grad(): - ov_model = ov.convert_model(unet, example_input=inputs) - # dictionary with added_cond_kwargs will be decomposed during conversion - # in some cases decomposition may lead to losing data type and shape information - # We need to recover it manually after the conversion - ov_model.inputs[-1].get_node().set_element_type(ov.Type.f32) - ov_model.validate_nodes_and_infer_types() - ov.save_model(ov_model, UNET_PATH) - del ov_model - cleanup_torchscript_cache() - - del unet - - gc.collect(); - -VAE Encoder and Decoder -~~~~~~~~~~~~~~~~~~~~~~~ - - - -The VAE model has two parts, an encoder and a decoder. The encoder is -used to convert the image into a low dimensional latent representation, -which will serve as the input to the U-Net model. The decoder, -conversely, transforms the latent representation back into an image. - -During latent diffusion training, the encoder is used to get the latent -representations (latents) of the images for the forward diffusion -process, which applies more and more noise at each step. During -inference, the denoised latents generated by the reverse diffusion -process are converted back into images using the VAE decoder. When you -run inference for Text-to-Image, there is no initial image as a starting -point. You can skip this step and directly generate initial random -noise. 
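A minimal sketch (not part of the original notebook, assuming the 512x512 working resolution and VAE scale factor of 8 used later in this tutorial) showing where the 4x64x64 latent shape in the conversion examples comes from:

.. code:: python

   height = width = 512      # default generation resolution used in this pipeline
   vae_scale_factor = 8      # spatial downscaling performed by the VAE encoder
   latent_channels = 4       # latent channels expected by the U-Net and VAE decoder

   latent_shape = (1, latent_channels, height // vae_scale_factor, width // vae_scale_factor)
   print(latent_shape)       # (1, 4, 64, 64) - matches the example inputs used for conversion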
VAE encoder is used in Image-to-Image generation pipelines for -creating initial latent state based on input image. The main difference -between IP-Adapter encoded image and VAE encoded image that the first is -used as addition into input prompt making connection between text and -image during conditioning, while the second used as Unet sample -initialization and does not give guarantee preserving some attributes of -initial image. It is still can be useful to use both ip-adapter and VAE -image in pipeline, we can discuss it in inference examples. - -.. code:: ipython3 - - VAE_DECODER_PATH = models_dir / "vae_decoder.xml" - VAE_ENCODER_PATH = models_dir / "vae_encoder.xml" - - if not VAE_DECODER_PATH.exists(): - - class VAEDecoderWrapper(torch.nn.Module): - def __init__(self, vae): - super().__init__() - self.vae = vae - - def forward(self, latents): - return self.vae.decode(latents) - - vae_decoder = VAEDecoderWrapper(vae) - with torch.no_grad(): - ov_model = ov.convert_model(vae_decoder, example_input=torch.ones([1, 4, 64, 64])) - ov.save_model(ov_model, VAE_DECODER_PATH) - del ov_model - cleanup_torchscript_cache() - del vae_decoder - - if not VAE_ENCODER_PATH.exists(): - - class VAEEncoderWrapper(torch.nn.Module): - def __init__(self, vae): - super().__init__() - self.vae = vae - - def forward(self, image): - return self.vae.encode(x=image)["latent_dist"].sample() - - vae_encoder = VAEEncoderWrapper(vae) - vae_encoder.eval() - image = torch.zeros((1, 3, 512, 512)) - with torch.no_grad(): - ov_model = ov.convert_model(vae_encoder, example_input=image) - ov.save_model(ov_model, VAE_ENCODER_PATH) - del ov_model - cleanup_torchscript_cache() - - del vae - gc.collect(); - -Text Encoder -~~~~~~~~~~~~ - - - -The text-encoder is responsible for transforming the input prompt, for -example, “a photo of an astronaut riding a horse” into an embedding -space that can be understood by the U-Net. It is usually a simple -transformer-based encoder that maps a sequence of input tokens to a -sequence of latent text embeddings. - -The input of the text encoder is tensor ``input_ids``, which contains -indexes of tokens from text processed by the tokenizer and padded to the -maximum length accepted by the model. Model outputs are two tensors: -``last_hidden_state`` - hidden state from the last MultiHeadAttention -layer in the model and ``pooler_out`` - pooled output for whole model -hidden states. - -.. code:: ipython3 - - TEXT_ENCODER_PATH = models_dir / "text_encoder.xml" - - if not TEXT_ENCODER_PATH.exists(): - with torch.no_grad(): - ov_model = ov.convert_model( - text_encoder, - example_input=torch.ones([1, 77], dtype=torch.long), - input=[ - (1, 77), - ], - ) - ov.save_model(ov_model, TEXT_ENCODER_PATH) - del ov_model - cleanup_torchscript_cache() - tokenizer.save_pretrained(models_dir / "tokenizer") - - del text_encoder - del tokenizer - -Prepare OpenVINO inference pipeline ------------------------------------ - - - -As shown on diagram below, the only difference between original Stable -Diffusion pipeline and IP-Adapter Stable Diffusion pipeline only in -additional conditioning by image processed via Image Encoder. -|pipeline.png| - -The stable diffusion model with ip-adapter takes a latent image -representation, a text prompt is transformed to text embeddings via CLIP -text encoder and ip-adapter image is transformed to image embeddings via -CLIP Image Encoder. Next, the U-Net iteratively *denoises* the random -latent image representations while being conditioned on the text and -image embeddings. 
The output of the U-Net, being the noise residual, is -used to compute a denoised latent image representation via a scheduler -algorithm. - -The *denoising* process is repeated given number of times (by default 4 -taking into account that we use LCM) to step-by-step retrieve better -latent image representations. When complete, the latent image -representation is decoded by the decoder part of the variational auto -encoder (VAE). - -.. |pipeline.png| image:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/1afc2ca6-e7ea-4c9e-a2d3-1173346dd9d6 - -.. code:: ipython3 - - import inspect - from typing import List, Optional, Union, Dict, Tuple - import numpy as np - - import PIL - import cv2 - import torch - - from transformers import CLIPTokenizer, CLIPImageProcessor - from diffusers import DiffusionPipeline - from diffusers.pipelines.stable_diffusion.pipeline_output import ( - StableDiffusionPipelineOutput, - ) - from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler - - - def scale_fit_to_window(dst_width: int, dst_height: int, image_width: int, image_height: int): - """ - Preprocessing helper function for calculating image size for resize with peserving original aspect ratio - and fitting image to specific window size - - Parameters: - dst_width (int): destination window width - dst_height (int): destination window height - image_width (int): source image width - image_height (int): source image height - Returns: - result_width (int): calculated width for resize - result_height (int): calculated height for resize - """ - im_scale = min(dst_height / image_height, dst_width / image_width) - return int(im_scale * image_width), int(im_scale * image_height) - - - def randn_tensor( - shape: Union[Tuple, List], - generator: Optional[Union[List["torch.Generator"], "torch.Generator"]] = None, - dtype: Optional["torch.dtype"] = None, - ): - """A helper function to create random tensors on the desired `device` with the desired `dtype`. When - passing a list of generators, you can seed each batch size individually. - - """ - batch_size = shape[0] - rand_device = torch.device("cpu") - - # make sure generator list of length 1 is treated like a non-list - if isinstance(generator, list) and len(generator) == 1: - generator = generator[0] - - if isinstance(generator, list): - shape = (1,) + shape[1:] - latents = [torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) for i in range(batch_size)] - latents = torch.cat(latents, dim=0) - else: - latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype) - - return latents - - - def preprocess(image: PIL.Image.Image, height, width): - """ - Image preprocessing function. Takes image in PIL.Image format, resizes it to keep aspect ration and fits to model input window 512x512, - then converts it to np.ndarray and adds padding with zeros on right or bottom side of image (depends from aspect ratio), after that - converts data to float32 data type and change range of values from [0, 255] to [-1, 1], finally, converts data layout from planar NHWC to NCHW. - The function returns preprocessed input tensor and padding size, which can be used in postprocessing. 
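        The returned padding metadata is consumed later by postprocess_image to remove
        the padding and resize the decoded image back to the source resolution.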
- - Parameters: - image (PIL.Image.Image): input image - Returns: - image (np.ndarray): preprocessed image tensor - meta (Dict): dictionary with preprocessing metadata info - """ - src_width, src_height = image.size - dst_width, dst_height = scale_fit_to_window(height, width, src_width, src_height) - image = np.array(image.resize((dst_width, dst_height), resample=PIL.Image.Resampling.LANCZOS))[None, :] - pad_width = width - dst_width - pad_height = height - dst_height - pad = ((0, 0), (0, pad_height), (0, pad_width), (0, 0)) - image = np.pad(image, pad, mode="constant") - image = image.astype(np.float32) / 255.0 - image = 2.0 * image - 1.0 - image = image.transpose(0, 3, 1, 2) - return image, {"padding": pad, "src_width": src_width, "src_height": src_height} - - - class OVStableDiffusionPipeline(DiffusionPipeline): - def __init__( - self, - vae_decoder: ov.Model, - text_encoder: ov.Model, - tokenizer: CLIPTokenizer, - unet: ov.Model, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], - image_encoder: ov.Model, - feature_extractor: CLIPImageProcessor, - vae_encoder: ov.Model, - ): - """ - Pipeline for text-to-image generation using Stable Diffusion and IP-Adapter with OpenVINO - Parameters: - vae_decoder (ov.Model): - Variational Auto-Encoder (VAE) Model to decode images to and from latent representations. - text_encoder (ov.Model):CLIPImageProcessor - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the clip-vit-large-patch14(https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (CLIPTokenizer): - Tokenizer of class CLIPTokenizer(https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet (ov.Model): Conditional U-Net architecture to denoise the encoded image latents. - scheduler (SchedulerMixin): - A scheduler to be used in combination with unet to denoise the encoded image latents - image_encoder (ov.Model): - IP-Adapter image encoder for embedding input image as input prompt for generation - feature_extractor : - """ - super().__init__() - self.scheduler = scheduler - self.vae_decoder = vae_decoder - self.image_encoder = image_encoder - self.text_encoder = text_encoder - self.unet = unet - self.height = 512 - self.width = 512 - self.vae_scale_factor = 8 - self.tokenizer = tokenizer - self.vae_encoder = vae_encoder - self.feature_extractor = feature_extractor - - def __call__( - self, - prompt: Union[str, List[str]], - ip_adapter_image: PIL.Image.Image, - image: PIL.Image.Image = None, - num_inference_steps: Optional[int] = 4, - negative_prompt: Union[str, List[str]] = None, - guidance_scale: Optional[float] = 0.5, - eta: Optional[float] = 0.0, - output_type: Optional[str] = "pil", - height: Optional[int] = None, - width: Optional[int] = None, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - strength: float = 1.0, - **kwargs, - ): - """ - Function invoked when calling the pipeline for generation. - Parameters: - prompt (str or List[str]): - The prompt or prompts to guide the image generation. - image (PIL.Image.Image, *optional*, None): - Intinal image for generation. - num_inference_steps (int, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. 
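                In this pipeline the signature default is 4, which is sufficient for the LCM scheduler used here.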
            negative_prompt (str or List[str]):
                The negative prompt or prompts to guide the image generation.
            guidance_scale (float, *optional*, defaults to 7.5):
                Guidance scale as defined in Classifier-Free Diffusion Guidance(https://arxiv.org/abs/2207.12598).
                guidance_scale is defined as `w` of equation 2.
                A higher guidance scale encourages generating images that are closely linked to the text prompt,
                usually at the expense of lower image quality.
            eta (float, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [DDIMScheduler], will be ignored for others.
            output_type (`str`, *optional*, defaults to "pil"):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): PIL.Image.Image or np.array.
            height (int, *optional*, 512):
                Generated image height
            width (int, *optional*, 512):
                Generated image width
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
        Returns:
            Dictionary with keys:
                sample - the last generated image, PIL.Image.Image or np.array
                iterations - *optional* (if gif=True) images for all diffusion steps, List of PIL.Image.Image or np.array.
        """
        do_classifier_free_guidance = guidance_scale > 1.0
        # get prompt text embeddings
        text_embeddings = self._encode_prompt(
            prompt,
            do_classifier_free_guidance=do_classifier_free_guidance,
            negative_prompt=negative_prompt,
        )
        # get ip-adapter image embeddings
        image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image)
        if do_classifier_free_guidance:
            image_embeds = np.concatenate([negative_image_embeds, image_embeds])

        # set timesteps
        accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
        extra_set_kwargs = {}
        if accepts_offset:
            extra_set_kwargs["offset"] = 1

        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
        latent_timestep = timesteps[:1]

        # get the initial random noise unless the user supplied it
        latents, meta = self.prepare_latents(
            1,
            4,
            height or self.height,
            width or self.width,
            generator=generator,
            latents=latents,
            image=image,
            latent_timestep=latent_timestep,
        )

        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
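        # note: the LCMScheduler used in this notebook does not accept `eta`,
        # so `extra_step_kwargs` stays empty here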
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if you are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet([latent_model_input, t, text_embeddings, image_embeds])[0] - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1] - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - torch.from_numpy(noise_pred), - t, - torch.from_numpy(latents), - **extra_step_kwargs, - )["prev_sample"].numpy() - - # scale and decode the image latents with vae - image = self.vae_decoder(latents * (1 / 0.18215))[0] - - image = self.postprocess_image(image, meta, output_type) - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=False) - - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int = 1, - do_classifier_free_guidance: bool = True, - negative_prompt: Union[str, List[str]] = None, - ): - """ - Encodes the prompt into text encoder hidden states. - - Parameters: - prompt (str or list(str)): prompt to be encoded - num_images_per_prompt (int): number of images that should be generated per prompt - do_classifier_free_guidance (bool): whether to use classifier free guidance or not - negative_prompt (str or list(str)): negative prompt to be encoded. 
- Returns: - text_embeddings (np.ndarray): text encoder hidden states - """ - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - # tokenize input prompts - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - - text_embeddings = self.text_encoder(text_input_ids)[0] - - # duplicate text embeddings for each generation per prompt - if num_images_per_prompt != 1: - bs_embed, seq_len, _ = text_embeddings.shape - text_embeddings = np.tile(text_embeddings, (1, num_images_per_prompt, 1)) - text_embeddings = np.reshape(text_embeddings, (bs_embed * num_images_per_prompt, seq_len, -1)) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_tokens: List[str] - max_length = text_input_ids.shape[-1] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - else: - uncond_tokens = negative_prompt - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - - uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = uncond_embeddings.shape[1] - uncond_embeddings = np.tile(uncond_embeddings, (1, num_images_per_prompt, 1)) - uncond_embeddings = np.reshape(uncond_embeddings, (batch_size * num_images_per_prompt, seq_len, -1)) - - # For classifier-free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings = np.concatenate([uncond_embeddings, text_embeddings]) - - return text_embeddings - - def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype=torch.float32, - generator=None, - latents=None, - image=None, - latent_timestep=None, - ): - shape = ( - batch_size, - num_channels_latents, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - if image is None: - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents.numpy(), {} - input_image, meta = preprocess(image, height, width) - image_latents = self.vae_encoder(input_image)[0] - image_latents = image_latents * 0.18215 - latents = self.scheduler.add_noise(torch.from_numpy(image_latents), latents, latent_timestep).numpy() - return latents, meta - - def postprocess_image(self, image: np.ndarray, meta: Dict, output_type: str = "pil"): - """ - Postprocessing for decoded image. Takes generated image decoded by VAE decoder, unpad it to initial image size (if required), - normalize and convert to [0, 255] pixels range. 
        Optionally, converts it from np.ndarray to PIL.Image format

        Parameters:
            image (np.ndarray):
                Generated image
            meta (Dict):
                Metadata obtained during the latents preparation step; can be empty
            output_type (str, *optional*, pil):
                Output format for result, can be pil or numpy
        Returns:
            image (List of np.ndarray or PIL.Image.Image):
                Post-processed images
        """
        if "padding" in meta:
            pad = meta["padding"]
            (_, end_h), (_, end_w) = pad[1:3]
            h, w = image.shape[2:]
            unpad_h = h - end_h
            unpad_w = w - end_w
            image = image[:, :, :unpad_h, :unpad_w]
        image = np.clip(image / 2 + 0.5, 0, 1)
        image = np.transpose(image, (0, 2, 3, 1))
        # convert to PIL
        if output_type == "pil":
            image = self.numpy_to_pil(image)
            if "src_height" in meta:
                orig_height, orig_width = meta["src_height"], meta["src_width"]
                image = [img.resize((orig_width, orig_height), PIL.Image.Resampling.LANCZOS) for img in image]
        else:
            if "src_height" in meta:
                orig_height, orig_width = meta["src_height"], meta["src_width"]
                image = [cv2.resize(img, (orig_width, orig_height)) for img in image]
        return image

    def encode_image(self, image, num_images_per_prompt=1):
        if not isinstance(image, torch.Tensor):
            image = self.feature_extractor(image, return_tensors="pt").pixel_values

        image_embeds = self.image_encoder(image)[0]
        if num_images_per_prompt > 1:
            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)

        uncond_image_embeds = np.zeros(image_embeds.shape)
        return image_embeds, uncond_image_embeds

    def get_timesteps(self, num_inference_steps: int, strength: float):
        """
        Helper function for getting scheduler timesteps for generation
        In case of image-to-image generation, it updates number of steps according to strength

        Parameters:
            num_inference_steps (int):
                number of inference steps for generation
            strength (float):
                value between 0.0 and 1.0, that controls the amount of noise that is added to the input image.
                Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input.
        """
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)

        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = self.scheduler.timesteps[t_start:]

        return timesteps, num_inference_steps - t_start

Run model inference
-------------------

Now let's configure our pipeline and take a look at the generation results.

Select inference device
~~~~~~~~~~~~~~~~~~~~~~~

Select the inference device from the dropdown list.

.. code:: ipython3

    from notebook_utils import device_widget

    device = device_widget()

    device

.. parsed-literal::

    Dropdown(description='Device:', options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='CPU')

.. code:: ipython3

    from transformers import AutoTokenizer

    core = ov.Core()

    ov_config = {"INFERENCE_PRECISION_HINT": "f32"} if device.value != "CPU" else {}
    vae_decoder = core.compile_model(VAE_DECODER_PATH, device.value, ov_config)
    vae_encoder = core.compile_model(VAE_ENCODER_PATH, device.value, ov_config)
    text_encoder = core.compile_model(TEXT_ENCODER_PATH, device.value)
    image_encoder = core.compile_model(IMAGE_ENCODER_PATH, device.value)
    unet = core.compile_model(UNET_PATH, device.value)

    scheduler = LCMScheduler.from_pretrained(models_dir / "scheduler")
    tokenizer = AutoTokenizer.from_pretrained(models_dir / "tokenizer")
    feature_extractor = CLIPImageProcessor.from_pretrained(models_dir / "feature_extractor")

    ov_pipe = OVStableDiffusionPipeline(
        vae_decoder,
        text_encoder,
        tokenizer,
        unet,
        scheduler,
        image_encoder,
        feature_extractor,
        vae_encoder,
    )

.. parsed-literal::

    The config attributes {'skip_prk_steps': True} were passed to LCMScheduler, but are not expected and will be ignored. Please verify your scheduler_config.json configuration file.

Generate image variation
~~~~~~~~~~~~~~~~~~~~~~~~

If we leave the input text prompt empty and provide only the ip-adapter
image, we can get a variation of the same image.

.. code:: ipython3

    import matplotlib.pyplot as plt


    def visualize_results(images, titles):
        """
        Helper function for results visualization

        Parameters:
            images (List[PIL.Image.Image]): images for visualization
            titles (List[str]): titles for the corresponding images
        Returns:
            fig (matplotlib.pyplot.Figure): matplotlib generated figure with the drawing result
        """
        im_w, im_h = images[0].size
        is_horizontal = im_h <= im_w
        figsize = (10, 15 * len(images)) if is_horizontal else (15 * len(images), 10)
        fig, axs = plt.subplots(
            1 if is_horizontal else len(images),
            len(images) if is_horizontal else 1,
            figsize=figsize,
            sharex="all",
            sharey="all",
        )
        fig.patch.set_facecolor("white")
        list_axes = list(axs.flat)
        for a in list_axes:
            a.set_xticklabels([])
            a.set_yticklabels([])
            a.get_xaxis().set_visible(False)
            a.get_yaxis().set_visible(False)
            a.grid(False)
        for image, title, ax in zip(images, titles, list_axes):
            ax.imshow(np.array(image))
            ax.set_title(title, fontsize=20)
        fig.subplots_adjust(wspace=0.0 if is_horizontal else 0.01, hspace=0.01 if is_horizontal else 0.0)
        fig.tight_layout()
        return fig

.. code:: ipython3

    generator = torch.Generator(device="cpu").manual_seed(576)

    test_img_path = Path("load_neg_embed.png")

    if not test_img_path.exists():
        image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png")
        image.save(test_img_path)
    else:
        image = load_image(test_img_path)

    result = ov_pipe(
        prompt="",
        ip_adapter_image=image,
        guidance_scale=1,
        negative_prompt="",
        num_inference_steps=4,
        generator=generator,
    )

    fig = visualize_results([image, result.images[0]], ["input image", "result"])

.. parsed-literal::

    0%|          | 0/4 [00:00

In this tutorial, we consider how to convert and run `Stable Diffusion
from KerasCV `__ that employs graph mode execution, which enhances
performance by leveraging graph optimization and enabling parallelism
and at the same time maintains a user-friendly interface for image
generation.
An -additional part demonstrates how to run optimization with -`NNCF `__ to speed up -pipeline. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Convert Stable Diffusion Pipeline models to - OpenVINO <#convert-stable-diffusion-pipeline-models-to-openvino>`__ - - - `Convert text encoder <#convert-text-encoder>`__ - - `Convert diffusion model <#convert-diffusion-model>`__ - - `Convert decoder <#convert-decoder>`__ - -- `Stable Diffusion Pipeline with - OpenVINO <#stable-diffusion-pipeline-with-openvino>`__ -- `Quantization <#quantization>`__ - - - `Prepare calibration dataset <#prepare-calibration-dataset>`__ - - `Run Quantization <#run-quantization>`__ - - `Run Weight Compression <#run-weight-compression>`__ - - `Compare model file sizes <#compare-model-file-sizes>`__ - - `Compare inference time of the FP16 and INT8 - pipelines <#compare-inference-time-of-the-fp16-and-int8-pipelines>`__ - -- `Interactive Demo <#interactive-demo>`__ - - - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %pip install -q "tensorflow-macos>=2.15; sys_platform == 'darwin' and platform_machine == 'arm64' and python_version > '3.8'" # macOS M1 and M2 - %pip install -q "tensorflow>=2.15; sys_platform == 'darwin' and platform_machine != 'arm64' and python_version > '3.8'" # macOS x86 - %pip install -q "tensorflow>=2.15; sys_platform != 'darwin' and python_version > '3.8'" - %pip install -q keras-cv tf_keras numpy "openvino>=2024.1.0" "gradio>=4.19" datasets "nncf>=2.10.0" - - %pip install -q "matplotlib>=3.4" - -Convert Stable Diffusion Pipeline models to OpenVINO -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Stable Diffusion consists of three parts: - -- A text encoder, which turns your prompt into a latent vector. -- A diffusion model, which repeatedly “denoises” a 64x64 latent image - patch. -- A decoder, which turns the final 64x64 latent patch into a - higher-resolution 512x512 image. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/67365453/2d7950a3-5bad-4670-897b-4d5327278feb - :alt: workflow-diagram - - workflow-diagram - -Let us convert each model to OpenVINO format. - -Import required modules and set constants - -.. code:: ipython3 - - import os - - os.environ["TF_USE_LEGACY_KERAS"] = "1" - - import keras_cv - import openvino as ov - import numpy as np - from pathlib import Path - import requests - - # Fetch `notebook_utils` module - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, device_widget - - IMAGE_WIDTH = 512 - IMAGE_HEIGHT = 512 - BATCH_SIZE = 1 - MAX_PROMPT_LENGTH = 77 - - - OV_TEXT_ENCODER_MODEL_PATH = Path("models/ov_text_encoder_model.xml") - OV_DIFFUSION_MODEL_PATH = Path("models/ov_diffusion_model.xml") - OV_DECODER_MODEL_PATH = Path("models/ov_decoder_model.xml") - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("stable-diffusion-keras-cv.ipynb") - -Create KerasCV Stable Diffusion pipeline - -.. 
code:: ipython3 - - pipeline = keras_cv.models.StableDiffusion(img_width=IMAGE_WIDTH, img_height=IMAGE_HEIGHT) - -Convert text encoder -^^^^^^^^^^^^^^^^^^^^ - - - -Text encoder has 2 inputs: ``tokens`` and ``positions``. Specify inputs -shapes and provide example data for model tracing. - -.. code:: ipython3 - - text_encoder_input = { - "tokens": (BATCH_SIZE, MAX_PROMPT_LENGTH), - "positions": (BATCH_SIZE, MAX_PROMPT_LENGTH), - } - - text_encoder_example_input = ( - np.random.randint(len(pipeline.tokenizer.vocab), size=(1, MAX_PROMPT_LENGTH)), - np.expand_dims(np.arange(MAX_PROMPT_LENGTH), axis=0), - ) - - ov_text_encoder = ov.convert_model( - pipeline.text_encoder, - example_input=text_encoder_example_input, - input=text_encoder_input, - ) - ov.save_model(ov_text_encoder, OV_TEXT_ENCODER_MODEL_PATH) - del ov_text_encoder - -Convert diffusion model -^^^^^^^^^^^^^^^^^^^^^^^ - - - -Diffusion model has 3 inputs ``latent``, ``timestep_embedding`` and -``context``. Specify inputs shapes and provide example data for model -tracing. - -.. code:: ipython3 - - diffusion_model_input = { - "latent": [BATCH_SIZE, pipeline.img_height // 8, pipeline.img_width // 8, 4], - "timestep_embedding": [BATCH_SIZE, 320], - "context": [BATCH_SIZE, MAX_PROMPT_LENGTH, 768], - } - - diffusion_model_example_input = ( - np.random.random(size=(1, pipeline.img_height // 8, pipeline.img_width // 8, 4)), - np.random.random(size=(1, 320)), - np.random.random(size=(1, MAX_PROMPT_LENGTH, 768)), - ) - - ov_diffusion_model = ov.convert_model( - pipeline.diffusion_model, - input=diffusion_model_input, - example_input=diffusion_model_example_input, - ) - ov.save_model(ov_diffusion_model, OV_DIFFUSION_MODEL_PATH) - del ov_diffusion_model - -Convert decoder -^^^^^^^^^^^^^^^ - - - -Decoder has 1 input for image latents. Specify input shapes and provide -example data for model tracing. - -.. code:: ipython3 - - decoder_input = [BATCH_SIZE, pipeline.img_height // 8, pipeline.img_width // 8, 4] - - decoder_example_input = np.random.random(size=(1, pipeline.img_height // 8, pipeline.img_width // 8, 4)) - - ov_decoder = ov.convert_model(pipeline.decoder, input=decoder_input, example_input=decoder_example_input) - ov.save_model(ov_decoder, OV_DECODER_MODEL_PATH) - del ov_decoder - -.. code:: ipython3 - - # free memory - import gc - - del pipeline - gc.collect(); - -Stable Diffusion Pipeline with OpenVINO -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Let’s take `KerasCV pipeline -implementation `__ -and replace original models with OpenVINO ones. - -.. 
code:: ipython3 - - """ - Credits: - - - Original implementation: - https://github.com/CompVis/stable-diffusion - - Initial TF/Keras port: - https://github.com/divamgupta/stable-diffusion-tensorflow - - Keras CV implementation: - https://github.com/keras-team/keras-cv/tree/master/keras_cv/models/stable_diffusion - """ - - import math - import tf_keras as keras - import numpy as np - import tensorflow as tf - from pathlib import Path - - from keras_cv.models.stable_diffusion import SimpleTokenizer - - - if not Path("./constants.py").exists(): - download_file(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-diffusion-keras-cv/constants.py") - from constants import UNCONDITIONAL_TOKENS, ALPHAS_CUMPROD - - - class StableDiffusion: - def __init__(self, text_encoder, diffusion_model, decoder): - # UNet requires multiples of 2**7 = 128 - img_height = round(IMAGE_HEIGHT / 128) * 128 - img_width = round(IMAGE_WIDTH / 128) * 128 - self.img_height = img_height - self.img_width = img_width - - self._tokenizer = None - self._text_encoder = text_encoder - self._diffusion_model = diffusion_model - self._decoder = decoder - - print( - "By using this model checkpoint, you acknowledge that its usage is " - "subject to the terms of the CreativeML Open RAIL-M license at " - "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/LICENSE" - ) - - def text_to_image( - self, - prompt, - negative_prompt=None, - num_steps=50, - unconditional_guidance_scale=7.5, - seed=None, - ): - encoded_text = self.encode_text(prompt) - - return self._generate_image( - encoded_text, - negative_prompt=negative_prompt, - batch_size=BATCH_SIZE, - num_steps=num_steps, - unconditional_guidance_scale=unconditional_guidance_scale, - seed=seed, - ) - - def encode_text(self, prompt): - # Tokenize prompt (i.e. starting context) - inputs = self.tokenizer.encode(prompt) - if len(inputs) > MAX_PROMPT_LENGTH: - raise ValueError(f"Prompt is too long (should be <= {MAX_PROMPT_LENGTH} tokens)") - - phrase = inputs + [49407] * (MAX_PROMPT_LENGTH - len(inputs)) - - phrase = tf.convert_to_tensor([phrase], dtype="int32") - - return self.text_encoder({"tokens": phrase, "positions": self._get_pos_ids()}) - - def text_encoder(self, args): - return self._call_ov_model(self._text_encoder, args) - - def diffusion_model(self, args): - return self._call_ov_model(self._diffusion_model, args) - - def decoder(self, args): - return self._call_ov_model(self._decoder, args) - - def _generate_image( - self, - encoded_text, - negative_prompt=None, - batch_size=BATCH_SIZE, - num_steps=50, - unconditional_guidance_scale=7.5, - diffusion_noise=None, - seed=None, - ): - if diffusion_noise is not None and seed is not None: - raise ValueError( - "`diffusion_noise` and `seed` should not both be passed to " - "`generate_image`. `seed` is only used to generate diffusion " - "noise when it's not already user-specified." 
- ) - - context = self._expand_tensor(encoded_text, batch_size) - - if negative_prompt is None: - unconditional_context = np.repeat(self._get_unconditional_context(), batch_size, axis=0) - else: - unconditional_context = self.encode_text(negative_prompt) - unconditional_context = self._expand_tensor(unconditional_context, batch_size) - - if diffusion_noise is not None: - diffusion_noise = np.squeeze(diffusion_noise) - - if len(np.shape(diffusion_noise)) == 3: - diffusion_noise = np.repeat(np.expand_dims(diffusion_noise, axis=0), batch_size, axis=0) - latent = diffusion_noise - else: - latent = self._get_initial_diffusion_noise(batch_size, seed) - - # Iterative reverse diffusion stage - num_timesteps = 1000 - ratio = (num_timesteps - 1) / (num_steps - 1) - timesteps = (np.arange(0, num_steps) * ratio).round().astype(np.int64) - - alphas, alphas_prev = self._get_initial_alphas(timesteps) - progbar = keras.utils.Progbar(len(timesteps)) - iteration = 0 - for index, timestep in list(enumerate(timesteps))[::-1]: - latent_prev = latent # Set aside the previous latent vector - t_emb = self._get_timestep_embedding(timestep, batch_size) - - unconditional_latent = self.diffusion_model( - { - "latent": latent, - "timestep_embedding": t_emb, - "context": unconditional_context, - } - ) - - latent = self.diffusion_model( - { - "latent": latent, - "timestep_embedding": t_emb, - "context": context, - } - ) - - latent = np.array(unconditional_latent + unconditional_guidance_scale * (latent - unconditional_latent)) - a_t, a_prev = alphas[index], alphas_prev[index] - # Keras backend array need to cast explicitly - target_dtype = latent_prev.dtype - latent = np.array(latent, target_dtype) - pred_x0 = (latent_prev - math.sqrt(1 - a_t) * latent) / math.sqrt(a_t) - latent = np.array(latent) * math.sqrt(1.0 - a_prev) + math.sqrt(a_prev) * pred_x0 - iteration += 1 - progbar.update(iteration) - - # Decoding stage - decoded = self.decoder(latent) - - decoded = ((decoded + 1) / 2) * 255 - return np.clip(decoded, 0, 255).astype("uint8") - - def _get_unconditional_context(self): - unconditional_tokens = tf.convert_to_tensor([UNCONDITIONAL_TOKENS], dtype="int32") - - unconditional_context = self.text_encoder({"tokens": unconditional_tokens, "positions": self._get_pos_ids()}) - - return unconditional_context - - def _expand_tensor(self, text_embedding, batch_size): - text_embedding = np.squeeze(text_embedding) - if len(text_embedding.shape) == 2: - text_embedding = np.repeat(np.expand_dims(text_embedding, axis=0), batch_size, axis=0) - return text_embedding - - @property - def tokenizer(self): - if self._tokenizer is None: - self._tokenizer = SimpleTokenizer() - return self._tokenizer - - def _call_ov_model(self, ov_model, args): - return ov_model(args)[ov_model.output(0)] - - def _get_timestep_embedding(self, timestep, batch_size, dim=320, max_period=10000): - half = dim // 2 - range = np.array(np.arange(0, half), "float32") - freqs = np.exp(-math.log(max_period) * range / half) - args = tf.convert_to_tensor([timestep], dtype="float32") * freqs - embedding = np.concatenate([np.cos(args), np.sin(args)], 0) - embedding = np.reshape(embedding, [1, -1]) - return np.repeat(embedding, batch_size, axis=0) - - def _get_initial_alphas(self, timesteps): - alphas = [ALPHAS_CUMPROD[t] for t in timesteps] - alphas_prev = [1.0] + alphas[:-1] - - return alphas, alphas_prev - - def _get_initial_diffusion_noise(self, batch_size, seed): - np.random.seed(seed) - return np.random.normal( - size=(batch_size, self.img_height // 8, 
self.img_width // 8, 4), - ) - - @staticmethod - def _get_pos_ids(): - return np.expand_dims(np.arange(MAX_PROMPT_LENGTH, dtype="int32"), 0) - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=4, options=('CPU', 'GPU.0', 'GPU.1', 'GPU.2', 'AUTO'), value='AUTO') - - - -Read and compile pipeline models using selected device. - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - ov_text_encoder = core.compile_model(OV_TEXT_ENCODER_MODEL_PATH, device.value) - ov_diffusion_model = core.compile_model(OV_DIFFUSION_MODEL_PATH, device.value) - ov_decoder = core.compile_model(OV_DECODER_MODEL_PATH, device.value) - -.. code:: ipython3 - - import matplotlib.pyplot as plt - - - def plot_images(images): - plt.figure(figsize=(8 * len(images), 10)) - for i in range(len(images)): - plt.subplot(1, len(images), i + 1) - plt.imshow(images[i]) - plt.axis("off") - -Create and run Stable Diffusion pipeline using OpenVINO models. - -.. code:: ipython3 - - ov_pipeline = StableDiffusion(text_encoder=ov_text_encoder, diffusion_model=ov_diffusion_model, decoder=ov_decoder) - - images = ov_pipeline.text_to_image("photograph of an astronaut riding a horse", num_steps=50, seed=80) - - plot_images(images) - - -.. parsed-literal:: - - By using this model checkpoint, you acknowledge that its usage is subject to the terms of the CreativeML Open RAIL-M license at https://raw.githubusercontent.com/CompVis/stable-diffusion/main/LICENSE - 50/50 [==============================] - 65s 1s/step - - - -.. image:: stable-diffusion-keras-cv-with-output_files/stable-diffusion-keras-cv-with-output_23_1.png - - -Quantization -~~~~~~~~~~~~ - - - -`NNCF `__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -According to ``keras_cv.models.StableDiffusion`` structure, the -diffusion model takes up significant portion of the overall pipeline -execution time. Now we will show you how to optimize the UNet part using -`NNCF `__ to reduce -computation cost and speed up the pipeline. Quantizing the rest of the -pipeline does not significantly improve inference performance but can -lead to a substantial degradation of accuracy. That’s why we use weight -compression for ``text_encoder`` and ``decoder`` to reduce memory -footprint. - -For the diffusion model we apply quantization in hybrid mode which means -that we quantize: (1) weights of MatMul and Embedding layers and (2) -activations of other layers. The steps are the following: - -1. Create a calibration dataset for quantization. -2. Collect operations with weights. -3. Run ``nncf.compress_model()`` to compress only the model weights. -4. Run ``nncf.quantize()`` on the compressed model with weighted - operations ignored by providing ``ignored_scope`` parameter. -5. Save the ``INT8`` model using ``openvino.save_model()`` function. - -Please select below whether you would like to run quantization to -improve model inference speed. - - **NOTE**: Quantization is time and memory consuming operation. - Running quantization code below may take some time. - -.. 
code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - import requests - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - ov_int8_pipeline = None - OV_INT8_DIFFUSION_MODEL_PATH = Path("models/ov_int8_diffusion_model.xml") - OV_INT8_TEXT_ENCODER_MODEL_PATH = Path("models/ov_int8_text_encoder_model.xml") - OV_INT8_DECODER_MODEL_PATH = Path("models/ov_int8_decoder_model.xml") - - %load_ext skip_kernel_extension - -Prepare calibration dataset -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -We use a portion of -`conceptual_captions `__ -dataset from Hugging Face as calibration data. To collect intermediate -model inputs for UNet optimization we should customize -``CompiledModel``. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import datasets - import numpy as np - from tqdm.notebook import tqdm - from typing import Any, Dict, List - - - class CompiledModelDecorator(ov.CompiledModel): - def __init__(self, compiled_model: ov.CompiledModel, data_cache: List[Any] = None, keep_prob: float = 0.5): - super().__init__(compiled_model) - self.data_cache = data_cache if data_cache is not None else [] - self.keep_prob = keep_prob - - def __call__(self, *args, **kwargs): - if np.random.rand() <= self.keep_prob: - self.data_cache.append(*args) - return super().__call__(*args, **kwargs) - - - def collect_calibration_data(ov_pipe, calibration_dataset_size: int, num_inference_steps: int = 50) -> List[Dict]: - original_unet = ov_pipe._diffusion_model - calibration_data = [] - ov_pipe._diffusion_model = CompiledModelDecorator(original_unet, calibration_data, keep_prob=0.7) - - dataset = datasets.load_dataset("google-research-datasets/conceptual_captions", split="train", streaming=True, trust_remote_code=True).shuffle(seed=42) - - # Run inference for data collection - pbar = tqdm(total=calibration_dataset_size) - for batch in dataset: - prompt = batch["caption"] - if len(prompt) > MAX_PROMPT_LENGTH: - continue - ov_pipe.text_to_image(prompt, num_steps=num_inference_steps, seed=1) - pbar.update(len(calibration_data) - pbar.n) - if pbar.n >= calibration_dataset_size: - break - - ov_pipe._diffusion_model = original_unet - return calibration_data[:calibration_dataset_size] - -.. code:: ipython3 - - %%skip not $to_quantize.value - - if not OV_INT8_DIFFUSION_MODEL_PATH.exists() : - subset_size = 200 - calibration_data = collect_calibration_data(ov_pipeline, calibration_dataset_size=subset_size) - - -.. parsed-literal:: - - /home/ltalamanova/tmp_venv/lib/python3.11/site-packages/datasets/load.py:1461: FutureWarning: The repository for conceptual_captions contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conceptual_captions - You can avoid this message in future by passing the argument `trust_remote_code=True`. - Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`. - warnings.warn( - - - -.. 
parsed-literal:: - - 0%| | 0/200 [00:00 bool: - allowed_types_list = ["f16", "f32", "f64"] - const_port_id = 0 - input_tensor = node.input_value(const_port_id) - if input_tensor.get_element_type().get_type_name() in allowed_types_list: - const_node = get_operation_const_op(node, const_port_id) - if const_node is not None: - return True - - return False - - - def collect_ops_with_weights(model): - ops_with_weights = [] - for op in model.get_ops(): - if op.get_type_name() == "MatMul": - constant_node_0 = get_operation_const_op(op, const_port_id=0) - constant_node_1 = get_operation_const_op(op, const_port_id=1) - if constant_node_0 or constant_node_1: - ops_with_weights.append(op.get_friendly_name()) - if op.get_type_name() == "Gather" and is_embedding(op): - ops_with_weights.append(op.get_friendly_name()) - - return ops_with_weights - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters - - if not OV_INT8_DIFFUSION_MODEL_PATH.exists(): - diffusion_model = core.read_model(OV_DIFFUSION_MODEL_PATH) - unet_ignored_scope = collect_ops_with_weights(diffusion_model) - compressed_diffusion_model = nncf.compress_weights(diffusion_model, ignored_scope=nncf.IgnoredScope(types=['Convolution'])) - quantized_diffusion_model = nncf.quantize( - model=compressed_diffusion_model, - calibration_dataset=nncf.Dataset(calibration_data), - subset_size=subset_size, - model_type=nncf.ModelType.TRANSFORMER, - ignored_scope=nncf.IgnoredScope(names=unet_ignored_scope), - advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alphas=AdvancedSmoothQuantParameters(matmul=-1)) - ) - ov.save_model(quantized_diffusion_model, OV_INT8_DIFFUSION_MODEL_PATH) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - INFO:nncf:98 ignored nodes were found by types in the NNCFGraph - INFO:nncf:Statistics of the bitwidth distribution: - +--------------+---------------------------+-----------------------------------+ - | Num bits (N) | % all parameters (layers) | % ratio-defining parameters | - | | | (layers) | - +==============+===========================+===================================+ - | 8 | 100% (184 / 184) | 100% (184 / 184) | - +--------------+---------------------------+-----------------------------------+ - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -.. 
parsed-literal:: - - INFO:nncf:184 ignored nodes were found by name in the NNCFGraph - INFO:nncf:128 ignored nodes were found by name in the NNCFGraph - INFO:nncf:Not adding activation input quantizer for operation: 4 diffusion_model/dense_72/MatMul - 8 diffusion_model/dense_72/BiasAdd - 44 diffusion_model/activation/mul_1 - - INFO:nncf:Not adding activation input quantizer for operation: 10 diffusion_model/spatial_transformer/basic_transformer_block/cross_attention_1/dense_81/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 11 diffusion_model/spatial_transformer_1/basic_transformer_block_1/cross_attention_3/dense_91/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 12 diffusion_model/spatial_transformer_1/basic_transformer_block_1/cross_attention_3/dense_92/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 13 diffusion_model/spatial_transformer_10/basic_transformer_block_10/cross_attention_21/dense_196/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 14 diffusion_model/spatial_transformer_10/basic_transformer_block_10/cross_attention_21/dense_197/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 15 diffusion_model/spatial_transformer_11/basic_transformer_block_11/cross_attention_23/dense_207/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 16 diffusion_model/spatial_transformer_11/basic_transformer_block_11/cross_attention_23/dense_208/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 17 diffusion_model/spatial_transformer_12/basic_transformer_block_12/cross_attention_25/dense_218/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 18 diffusion_model/spatial_transformer_12/basic_transformer_block_12/cross_attention_25/dense_219/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 19 diffusion_model/spatial_transformer_13/basic_transformer_block_13/cross_attention_27/dense_229/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 20 diffusion_model/spatial_transformer_13/basic_transformer_block_13/cross_attention_27/dense_230/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 21 diffusion_model/spatial_transformer_14/basic_transformer_block_14/cross_attention_29/dense_240/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 22 diffusion_model/spatial_transformer_14/basic_transformer_block_14/cross_attention_29/dense_241/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 23 diffusion_model/spatial_transformer_15/basic_transformer_block_15/cross_attention_31/dense_251/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 24 diffusion_model/spatial_transformer_15/basic_transformer_block_15/cross_attention_31/dense_252/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 25 diffusion_model/spatial_transformer_2/basic_transformer_block_2/cross_attention_5/dense_102/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 26 diffusion_model/spatial_transformer_2/basic_transformer_block_2/cross_attention_5/dense_103/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 27 diffusion_model/spatial_transformer_3/basic_transformer_block_3/cross_attention_7/dense_113/Tensordot/MatMul - 
INFO:nncf:Not adding activation input quantizer for operation: 28 diffusion_model/spatial_transformer_3/basic_transformer_block_3/cross_attention_7/dense_114/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 29 diffusion_model/spatial_transformer_4/basic_transformer_block_4/cross_attention_9/dense_124/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 30 diffusion_model/spatial_transformer_4/basic_transformer_block_4/cross_attention_9/dense_125/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 31 diffusion_model/spatial_transformer_5/basic_transformer_block_5/cross_attention_11/dense_135/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 32 diffusion_model/spatial_transformer_5/basic_transformer_block_5/cross_attention_11/dense_136/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 33 diffusion_model/spatial_transformer_6/basic_transformer_block_6/cross_attention_13/dense_148/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 34 diffusion_model/spatial_transformer_6/basic_transformer_block_6/cross_attention_13/dense_149/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 35 diffusion_model/spatial_transformer_7/basic_transformer_block_7/cross_attention_15/dense_163/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 36 diffusion_model/spatial_transformer_7/basic_transformer_block_7/cross_attention_15/dense_164/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 37 diffusion_model/spatial_transformer_8/basic_transformer_block_8/cross_attention_17/dense_174/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 38 diffusion_model/spatial_transformer_8/basic_transformer_block_8/cross_attention_17/dense_175/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 39 diffusion_model/spatial_transformer_9/basic_transformer_block_9/cross_attention_19/dense_185/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 40 diffusion_model/spatial_transformer_9/basic_transformer_block_9/cross_attention_19/dense_186/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 84 diffusion_model/dense_73/MatMul - 122 diffusion_model/dense_73/BiasAdd - 168 diffusion_model/res_block/activation_2/mul_1 - - INFO:nncf:Not adding activation input quantizer for operation: 218 diffusion_model/res_block/dense_74/MatMul - 287 diffusion_model/res_block/dense_74/BiasAdd - - INFO:nncf:Not adding activation input quantizer for operation: 219 diffusion_model/res_block_1/dense_85/MatMul - 288 diffusion_model/res_block_1/dense_85/BiasAdd - - INFO:nncf:Not adding activation input quantizer for operation: 220 diffusion_model/res_block_10/dense_154/MatMul - 289 diffusion_model/res_block_10/dense_154/BiasAdd - - INFO:nncf:Not adding activation input quantizer for operation: 221 diffusion_model/res_block_11/dense_155/MatMul - 290 diffusion_model/res_block_11/dense_155/BiasAdd - - INFO:nncf:Not adding activation input quantizer for operation: 222 diffusion_model/res_block_12/dense_156/MatMul - 291 diffusion_model/res_block_12/dense_156/BiasAdd - - INFO:nncf:Not adding activation input quantizer for operation: 223 diffusion_model/res_block_13/dense_157/MatMul - 292 diffusion_model/res_block_13/dense_157/BiasAdd - - INFO:nncf:Not adding activation input 
quantizer for operation: 224 diffusion_model/res_block_14/dense_168/MatMul - 293 diffusion_model/res_block_14/dense_168/BiasAdd - - [elided: the remaining INFO:nncf "Not adding activation input quantizer" messages for the other diffusion_model res_block MatMul/BiasAdd and spatial_transformer Tensordot/MatMul operations] - - INFO:nncf:Not adding activation input
quantizer for operation: 3531 diffusion_model/spatial_transformer_15/basic_transformer_block_15/cross_attention_31/dense_250/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 723 diffusion_model/spatial_transformer_15/basic_transformer_block_15/cross_attention_31/dense_253/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 2958 diffusion_model/spatial_transformer_15/basic_transformer_block_15/geglu_15/dense_254/Tensordot/MatMul - INFO:nncf:Not adding activation input quantizer for operation: 5020 diffusion_model/spatial_transformer_15/basic_transformer_block_15/dense_255/Tensordot/MatMul - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -Run Weight Compression -^^^^^^^^^^^^^^^^^^^^^^ - - - -Quantizing of the ``text encoder`` and ``decoder`` does not -significantly improve inference performance but can lead to a -substantial degradation of accuracy. The weight compression will be -applied to footprint reduction. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - if not OV_INT8_TEXT_ENCODER_MODEL_PATH.exists(): - text_encoder_model = core.read_model(OV_TEXT_ENCODER_MODEL_PATH) - compressed_text_encoder_model = nncf.compress_weights(text_encoder_model) - ov.save_model(compressed_text_encoder_model, OV_INT8_TEXT_ENCODER_MODEL_PATH) - - if not OV_INT8_DECODER_MODEL_PATH.exists(): - decoder_model = core.read_model(OV_DECODER_MODEL_PATH) - compressed_decoder_model = nncf.compress_weights(decoder_model) - ov.save_model(compressed_decoder_model, OV_INT8_DECODER_MODEL_PATH) - - -.. parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - +--------------+---------------------------+-----------------------------------+ - | Num bits (N) | % all parameters (layers) | % ratio-defining parameters | - | | | (layers) | - +==============+===========================+===================================+ - | 8 | 100% (74 / 74) | 100% (74 / 74) | - +--------------+---------------------------+-----------------------------------+ - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - INFO:nncf:Statistics of the bitwidth distribution: - +--------------+---------------------------+-----------------------------------+ - | Num bits (N) | % all parameters (layers) | % ratio-defining parameters | - | | | (layers) | - +==============+===========================+===================================+ - | 8 | 100% (40 / 40) | 100% (40 / 40) | - +--------------+---------------------------+-----------------------------------+ - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -Let’s compare the images generated by the original and optimized -pipelines. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - ov_int8_text_encoder = core.compile_model(OV_INT8_TEXT_ENCODER_MODEL_PATH, device.value) - ov_int8_diffusion_model = core.compile_model(OV_INT8_DIFFUSION_MODEL_PATH, device.value) - ov_int8_decoder = core.compile_model(OV_INT8_DECODER_MODEL_PATH, device.value) - - ov_int8_pipeline = StableDiffusion( - text_encoder=ov_int8_text_encoder, diffusion_model=ov_int8_diffusion_model, decoder=ov_int8_decoder, - ) - - int8_image = ov_int8_pipeline.text_to_image( - "photograph of an astronaut riding a horse", - num_steps=50, - seed=80 - )[0] - - -.. 
parsed-literal:: - - By using this model checkpoint, you acknowledge that its usage is subject to the terms of the CreativeML Open RAIL-M license at https://raw.githubusercontent.com/CompVis/stable-diffusion/main/LICENSE - 50/50 [==============================] - 39s 785ms/step - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import matplotlib.pyplot as plt - - def visualize_results(orig_img, optimized_img): - """ - Helper function for results visualization - - Parameters: - orig_img (Image.Image): generated image using FP16 models - optimized_img (Image.Image): generated image using quantized models - Returns: - fig (matplotlib.pyplot.Figure): matplotlib generated figure contains drawing result - """ - orig_title = "FP16 pipeline" - control_title = "INT8 pipeline" - figsize = (20, 20) - fig, axs = plt.subplots(1, 2, figsize=figsize, sharex='all', sharey='all') - list_axes = list(axs.flat) - for a in list_axes: - a.set_xticklabels([]) - a.set_yticklabels([]) - a.get_xaxis().set_visible(False) - a.get_yaxis().set_visible(False) - a.grid(False) - list_axes[0].imshow(np.array(orig_img)) - list_axes[1].imshow(np.array(optimized_img)) - list_axes[0].set_title(orig_title, fontsize=15) - list_axes[1].set_title(control_title, fontsize=15) - - fig.subplots_adjust(wspace=0.01, hspace=0.01) - fig.tight_layout() - return fig - -.. code:: ipython3 - - %%skip not $to_quantize.value - - visualize_results(images[0], int8_image) - - - -.. image:: stable-diffusion-keras-cv-with-output_files/stable-diffusion-keras-cv-with-output_38_0.png - - -Compare model file sizes -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - fp16_model_paths = [OV_TEXT_ENCODER_MODEL_PATH, OV_DIFFUSION_MODEL_PATH, OV_DECODER_MODEL_PATH] - int8_model_paths = [OV_INT8_TEXT_ENCODER_MODEL_PATH, OV_INT8_DIFFUSION_MODEL_PATH, OV_INT8_DECODER_MODEL_PATH] - - for fp16_path, int8_path in zip(fp16_model_paths, int8_model_paths): - fp16_ir_model_size = fp16_path.with_suffix(".bin").stat().st_size - int8_model_size = int8_path.with_suffix(".bin").stat().st_size - print(f"{fp16_path.stem} compression rate: {fp16_ir_model_size / int8_model_size:.3f}") - - -.. parsed-literal:: - - ov_text_encoder_model compression rate: 1.992 - ov_diffusion_model compression rate: 1.997 - ov_decoder_model compression rate: 1.997 - - -Compare inference time of the FP16 and INT8 pipelines -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To measure the inference performance of the ``FP16`` and ``INT8`` -pipelines, we use median inference time on calibration subset. - - **NOTE**: For the most accurate performance estimation, it is - recommended to run ``benchmark_app`` in a terminal/command prompt - after closing other applications. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import time - - def calculate_inference_time(pipeline, validation_data): - inference_time = [] - for prompt in validation_data: - start = time.perf_counter() - _ = pipeline.text_to_image(prompt, num_steps=50, seed=1) - end = time.perf_counter() - delta = end - start - inference_time.append(delta) - return np.median(inference_time) - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - validation_size = 3 - validation_dataset = datasets.load_dataset("google-research-datasets/conceptual_captions", split="train", streaming=True, trust_remote_code=True).take(validation_size) - validation_data = [batch["caption"] for batch in validation_dataset] - - fp_latency = calculate_inference_time(ov_pipeline, validation_data) - int8_latency = calculate_inference_time(ov_int8_pipeline, validation_data) - print(f"Performance speed-up: {fp_latency / int8_latency:.3f}") - - -.. parsed-literal:: - - /home/ltalamanova/tmp_venv/lib/python3.11/site-packages/datasets/load.py:1461: FutureWarning: The repository for conceptual_captions contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conceptual_captions - You can avoid this message in future by passing the argument `trust_remote_code=True`. - Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`. - warnings.warn( - - -.. parsed-literal:: - - 50/50 [==============================] - 65s 1s/step - 50/50 [==============================] - 65s 1s/step - 50/50 [==============================] - 65s 1s/step - 50/50 [==============================] - 39s 785ms/step - 50/50 [==============================] - 39s 783ms/step - 50/50 [==============================] - 39s 784ms/step - Performance speed-up: 1.628 - - -Interactive Demo -~~~~~~~~~~~~~~~~ - - - -Please select below whether you would like to use the quantized model to -launch the interactive demo. - -.. code:: ipython3 - - import ipywidgets as widgets - - use_quantized_model = widgets.Checkbox( - description="Use quantized model", - value=ov_int8_pipeline is not None, - disabled=ov_int8_pipeline is None, - ) - - use_quantized_model - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Use quantized model') - - - -.. 
code:: ipython3 - - if not Path("gradio_helper.py").exists(): - download_file(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-diffusion-keras-cv/gradio_helper.py") - - from gradio_helper import make_demo - - pipeline = ov_int8_pipeline if use_quantized_model.value else ov_pipeline - - demo = make_demo(pipeline) - - try: - demo.launch(debug=True, height=1000) - except Exception: - demo.launch(share=True, debug=True, height=1000) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/stable-diffusion-keras-cv-with-output_files/stable-diffusion-keras-cv-with-output_23_1.png b/docs/notebooks/stable-diffusion-keras-cv-with-output_files/stable-diffusion-keras-cv-with-output_23_1.png deleted file mode 100644 index f9bd75b3ef2c2f..00000000000000 --- a/docs/notebooks/stable-diffusion-keras-cv-with-output_files/stable-diffusion-keras-cv-with-output_23_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:13a89b75dde82c752fa6bfdcc9d98e3be6968878c0838852fb11cdb5a8f48958 -size 707590 diff --git a/docs/notebooks/stable-diffusion-keras-cv-with-output_files/stable-diffusion-keras-cv-with-output_38_0.png b/docs/notebooks/stable-diffusion-keras-cv-with-output_files/stable-diffusion-keras-cv-with-output_38_0.png deleted file mode 100644 index bd4a81e2438857..00000000000000 --- a/docs/notebooks/stable-diffusion-keras-cv-with-output_files/stable-diffusion-keras-cv-with-output_38_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0cae00d7c793e1486ac63e4aa48e49d8b0e87605fa0f46ec00bc0783fb781dc2 -size 2851972 diff --git a/docs/notebooks/stable-diffusion-text-to-image-with-output.rst b/docs/notebooks/stable-diffusion-text-to-image-with-output.rst deleted file mode 100644 index 00e030d4407697..00000000000000 --- a/docs/notebooks/stable-diffusion-text-to-image-with-output.rst +++ /dev/null @@ -1,645 +0,0 @@ -Text-to-Image Generation with Stable Diffusion and OpenVINO™ -============================================================ - -Stable Diffusion is a text-to-image latent diffusion model created by -the researchers and engineers from -`CompVis `__, `Stability -AI `__ and `LAION `__. It is -trained on 512x512 images from a subset of the -`LAION-5B `__ database. This model uses -a frozen CLIP ViT-L/14 text encoder to condition the model on text -prompts. With its 860M UNet and 123M text encoder. See the `model -card `__ for more -information. - -General diffusion models are machine learning systems that are trained -to denoise random gaussian noise step by step, to get to a sample of -interest, such as an image. Diffusion models have shown to achieve -state-of-the-art results for generating image data. But one downside of -diffusion models is that the reverse denoising process is slow. In -addition, these models consume a lot of memory because they operate in -pixel space, which becomes unreasonably expensive when generating -high-resolution images. Therefore, it is challenging to train these -models and also use them for inference. OpenVINO brings capabilities to -run model inference on Intel hardware and opens the door to the -fantastic world of diffusion models for everyone! 
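To make the step-by-step denoising idea above a little more concrete, here is a minimal, purely schematic sketch of a reverse-diffusion loop in NumPy. The names ``toy_denoise`` and ``predict_noise`` are invented for this illustration; the actual pipeline used later in this notebook delegates the loop to a conditioned U-Net and a scheduler rather than the crude update rule shown here.

.. code:: python

   import numpy as np


   def toy_denoise(predict_noise, num_steps=50, shape=(4, 64, 64), seed=42):
       # Start from pure Gaussian noise, as in text-to-image generation.
       rng = np.random.default_rng(seed)
       latent = rng.standard_normal(shape)
       for step in reversed(range(num_steps)):
           # `predict_noise` stands in for the noise-prediction model (the U-Net).
           noise_estimate = predict_noise(latent, step)
           # Toy update rule: remove a fraction of the estimated noise.
           # A real scheduler (PNDM, DDIM, K-LMS, ...) applies a principled formula.
           latent = latent - noise_estimate / num_steps
       return latent


   # Example usage with a trivial stand-in for the model:
   final_latent = toy_denoise(lambda latent, step: 0.1 * latent)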
- -Model capabilities are not limited text-to-image only, it also is able -solve additional tasks, for example text-guided image-to-image -generation and inpainting. This tutorial also considers how to run -text-guided image-to-image generation using Stable Diffusion. - -This notebook demonstrates how to convert and run stable diffusion model -using OpenVINO. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Prepare Inference Pipelines <#prepare-inference-pipelines>`__ -- `Text-to-image pipeline <#text-to-image-pipeline>`__ - - - `Load Stable Diffusion model and create text-to-image - pipeline <#load-stable-diffusion-model-and-create-text-to-image-pipeline>`__ - - `Text-to-Image generation <#text-to-image-generation>`__ - - `Interactive text-to-image - demo <#interactive-text-to-image-demo>`__ - -- `Image-to-Image pipeline <#image-to-image-pipeline>`__ - - - `Create image-to-Image - pipeline <#create-image-to-image-pipeline>`__ - - `Image-to-Image generation <#image-to-image-generation>`__ - - `Interactive image-to-image - demo <#interactive-image-to-image-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "openvino>=2023.1.0" "git+https://github.com/huggingface/optimum-intel.git" - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "diffusers>=0.9.0" "torch>=2.1" - %pip install -q "huggingface-hub>=0.9.1" - %pip install -q "gradio>=4.19" - %pip install -q transformers Pillow opencv-python tqdm - -.. code:: ipython3 - - # Fetch `notebook_utils` module - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("stable-diffusion-text-to-image.ipynb") - -Prepare Inference Pipelines ---------------------------- - - - -Let us now take a closer look at how the model works in inference by -illustrating the logical flow. - -.. figure:: https://user-images.githubusercontent.com/29454499/260981188-c112dd0a-5752-4515-adca-8b09bea5d14a.png - :alt: sd-pipeline - - sd-pipeline - -As you can see from the diagram, the only difference between -Text-to-Image and text-guided Image-to-Image generation in approach is -how initial latent state is generated. In case of Image-to-Image -generation, you additionally have an image encoded by VAE encoder mixed -with the noise produced by using latent seed, while in Text-to-Image you -use only noise as initial latent state. The stable diffusion model takes -both a latent image representation of size :math:`64 \times 64` and a -text prompt is transformed to text embeddings of size -:math:`77 \times 768` via CLIP’s text encoder as an input. - -Next, the U-Net iteratively *denoises* the random latent image -representations while being conditioned on the text embeddings. 
The -output of the U-Net, being the noise residual, is used to compute a -denoised latent image representation via a scheduler algorithm. Many -different scheduler algorithms can be used for this computation, each -having its pros and cons. For Stable Diffusion, it is recommended to use -one of: - -- `PNDM - scheduler `__ -- `DDIM - scheduler `__ -- `K-LMS - scheduler `__\ (you - will use it in your pipeline) - -Theory on how the scheduler algorithm function works is out of scope for -this notebook. Nonetheless, in short, you should remember that you -compute the predicted denoised image representation from the previous -noise representation and the predicted noise residual. For more -information, refer to the recommended `Elucidating the Design Space of -Diffusion-Based Generative Models `__ - -The *denoising* process is repeated given number of times (by default -50) to step-by-step retrieve better latent image representations. When -complete, the latent image representation is decoded by the decoder part -of the variational auto encoder. - -Text-to-image pipeline ----------------------- - - - -Load Stable Diffusion model and create text-to-image pipeline -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We will load optimized Stable Diffusion model from the Hugging Face Hub -and create pipeline to run an inference with OpenVINO Runtime by -`Optimum -Intel `__. - -For running the Stable Diffusion model with Optimum Intel, we will use -the ``optimum.intel.OVStableDiffusionPipeline`` class, which represents -the inference pipeline. ``OVStableDiffusionPipeline`` initialized by the -``from_pretrained`` method. It supports on-the-fly conversion models -from PyTorch using the ``export=True`` parameter. A converted model can -be saved on disk using the ``save_pretrained`` method for the next -running. - -When Stable Diffusion models are exported to the OpenVINO format, they -are decomposed into three components that consist of four models -combined during inference into the pipeline: - -- The text encoder - - - The text-encoder is responsible for transforming the input - prompt(for example “a photo of an astronaut riding a horse”) into - an embedding space that can be understood by the U-Net. It is - usually a simple transformer-based encoder that maps a sequence of - input tokens to a sequence of latent text embeddings. - -- The U-NET - - - Model predicts the ``sample`` state for the next step. - -- The VAE encoder - - - The encoder is used to convert the image into a low dimensional - latent representation, which will serve as the input to the U-Net - model. - -- The VAE decoder - - - The decoder transforms the latent representation back into an - image. - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - device = device_widget() - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - from optimum.intel.openvino import OVStableDiffusionPipeline - from pathlib import Path - - DEVICE = device.value - - MODEL_ID = "prompthero/openjourney" - MODEL_DIR = Path("diffusion_pipeline") - - if not MODEL_DIR.exists(): - ov_pipe = OVStableDiffusionPipeline.from_pretrained(MODEL_ID, export=True, device=DEVICE, compile=False) - ov_pipe.save_pretrained(MODEL_DIR) - else: - ov_pipe = OVStableDiffusionPipeline.from_pretrained(MODEL_DIR, device=DEVICE, compile=False) - - ov_pipe.compile() - - -.. 
parsed-literal:: - - Compiling the vae_decoder to CPU ... - Compiling the unet to CPU ... - Compiling the text_encoder to CPU ... - Compiling the vae_encoder to CPU ... - - -Text-to-Image generation -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Now, you can define a text prompt for image generation and run inference -pipeline. - - **Note**: Consider increasing ``steps`` to get more precise results. - A suggested value is ``50``, but it will take longer time to process. - -.. code:: ipython3 - - import ipywidgets as widgets - - sample_text = ( - "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting, epic composition. " - "A golden daylight, hyper-realistic environment. " - "Hyper and intricate detail, photo-realistic. " - "Cinematic and volumetric light. " - "Epic concept art. " - "Octane render and Unreal Engine, trending on artstation" - ) - text_prompt = widgets.Text(value=sample_text, description="your text") - num_steps = widgets.IntSlider(min=1, max=50, value=20, description="steps:") - seed = widgets.IntSlider(min=0, max=10000000, description="seed: ", value=42) - widgets.VBox([text_prompt, num_steps, seed]) - - - - -.. parsed-literal:: - - VBox(children=(Text(value='cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour ci… - - - -.. code:: ipython3 - - print("Pipeline settings") - print(f"Input text: {text_prompt.value}") - print(f"Seed: {seed.value}") - print(f"Number of steps: {num_steps.value}") - - -.. parsed-literal:: - - Pipeline settings - Input text: cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting, epic composition. A golden daylight, hyper-realistic environment. Hyper and intricate detail, photo-realistic. Cinematic and volumetric light. Epic concept art. Octane render and Unreal Engine, trending on artstation - Seed: 42 - Number of steps: 20 - - -Let’s generate an image and save the generation results. The pipeline -returns one or several results: ``images`` contains final generated -image. To get more than one result, you can set the -``num_images_per_prompt`` parameter. - -.. code:: ipython3 - - import numpy as np - - np.random.seed(seed.value) - - result = ov_pipe(text_prompt.value, num_inference_steps=num_steps.value) - - final_image = result["images"][0] - final_image.save("result.png") - - - -.. parsed-literal:: - - 0%| | 0/21 [00:00`__ and -`LAION `__. - -General diffusion models are machine learning systems that are trained -to denoise random gaussian noise step by step, to get to a sample of -interest, such as an image. Diffusion models have shown to achieve -state-of-the-art results for generating image data. But one downside of -diffusion models is that the reverse denoising process is slow. In -addition, these models consume a lot of memory because they operate in -pixel space, which becomes unreasonably expensive when generating -high-resolution images. Therefore, it is challenging to train these -models and also use them for inference. OpenVINO brings capabilities to -run model inference on Intel hardware and opens the door to the -fantastic world of diffusion models for everyone! - -This notebook demonstrates how to run stable diffusion model using -`Diffusers `__ library and -`OpenVINO TorchDynamo -backend `__ -for Text-to-Image and Image-to-Image generation tasks. - -Notebook contains the following steps: - -1. Create pipeline with PyTorch models. -2. Add OpenVINO optimization using OpenVINO TorchDynamo backend. -3. 
Run Stable Diffusion pipeline with OpenVINO. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Stable Diffusion with Diffusers - library <#stable-diffusion-with-diffusers-library>`__ -- `OpenVINO TorchDynamo backend <#openvino-torchdynamo-backend>`__ - - - `Run Image generation <#run-image-generation>`__ - -- `Interactive demo <#interactive-demo>`__ -- `Support for Automatic1111 Stable Diffusion - WebUI <#support-for-automatic1111-stable-diffusion-webui>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "torch>=2.2" transformers diffusers "gradio>=4.19" ipywidgets --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "openvino>=2024.1.0" - -.. code:: ipython3 - - import torch - - from diffusers import StableDiffusionPipeline - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("stable-diffusion-torchdynamo-backend.ipynb") - -Stable Diffusion with Diffusers library ---------------------------------------- - - - -To work with Stable Diffusion v2.1, we will use Hugging Face Diffusers -library. To experiment with Stable Diffusion models, Diffusers exposes -the -`StableDiffusionPipeline `__ -and -`StableDiffusionImg2ImgPipeline `__ -similar to the other `Diffusers -pipelines `__. -The code below demonstrates how to create the -``StableDiffusionPipeline`` using ``stable-diffusion-2-1-base`` model: - -.. code:: ipython3 - - model_id = "stabilityai/stable-diffusion-2-1-base" - - # Pipeline for text-to-image generation - pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float32) - - - -.. parsed-literal:: - - Loading pipeline components...: 0%| | 0/6 [00:00`__ -lets you enable `OpenVINO `__ -support for PyTorch models with minimal changes to the original PyTorch -script. It speeds up PyTorch code by JIT-compiling it into optimized -kernels. By default, Torch code runs in eager-mode, but with the use of -torch.compile it goes through the following steps: - -1. Graph acquisition - the model is rewritten as blocks of subgraphs that are either: - - - compiled by TorchDynamo and “flattened”, - - falling back to the eager-mode, due to unsupported Python constructs (like control-flow - code). - -2. Graph lowering - all PyTorch operations are decomposed into - their constituent kernels specific to the chosen backend. -3. Graph compilation - the kernels call their corresponding low-level - device-specific operations. - -Select device for inference and enable or disable saving the optimized -model files to a hard drive, after the first application run. This makes -them available for the following application executions, reducing the -first-inference latency. Read more about available `Environment -Variables -options `__ - -.. 
code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='CPU') - - - -.. code:: ipython3 - - import ipywidgets as widgets - - model_caching = widgets.Dropdown( - options=[True, False], - value=True, - description="Model caching:", - disabled=False, - ) - - model_caching - - - - -.. parsed-literal:: - - Dropdown(description='Model caching:', options=(True, False), value=True) - - - -To use `torch.compile() -method `__, -you just need to add an import statement and define the OpenVINO -backend: - -.. code:: ipython3 - - # this import is required to activate the openvino backend for torchdynamo - import openvino.torch # noqa: F401 - - pipe.unet = torch.compile( - pipe.unet, - backend="openvino", - options={"device": device.value, "model_caching": model_caching.value}, - ) - - **Note**: Read more about available `OpenVINO - backends `__ - -Run Image generation -~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - prompt = "a photo of an astronaut riding a horse on mars" - image = pipe(prompt).images[0] - image - - - -.. parsed-literal:: - - 0%| | 0/50 [00:00`__. diff --git a/docs/notebooks/stable-diffusion-torchdynamo-backend-with-output_files/stable-diffusion-torchdynamo-backend-with-output_14_1.jpg b/docs/notebooks/stable-diffusion-torchdynamo-backend-with-output_files/stable-diffusion-torchdynamo-backend-with-output_14_1.jpg deleted file mode 100644 index 29f3c227d69265..00000000000000 --- a/docs/notebooks/stable-diffusion-torchdynamo-backend-with-output_files/stable-diffusion-torchdynamo-backend-with-output_14_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:155473a8e74d6f82ce2d42c2459b874ff35f5d8f688094349c601e45d7a06f59 -size 29477 diff --git a/docs/notebooks/stable-diffusion-torchdynamo-backend-with-output_files/stable-diffusion-torchdynamo-backend-with-output_14_1.png b/docs/notebooks/stable-diffusion-torchdynamo-backend-with-output_files/stable-diffusion-torchdynamo-backend-with-output_14_1.png deleted file mode 100644 index 836999e85be825..00000000000000 --- a/docs/notebooks/stable-diffusion-torchdynamo-backend-with-output_files/stable-diffusion-torchdynamo-backend-with-output_14_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:df60fec77f2c7a17c3f3b1b6b0013ed6809af5f67ca74939e9566573d65401f4 -size 440506 diff --git a/docs/notebooks/stable-diffusion-v2-infinite-zoom-with-output.rst b/docs/notebooks/stable-diffusion-v2-infinite-zoom-with-output.rst deleted file mode 100644 index 3752c9618ccca8..00000000000000 --- a/docs/notebooks/stable-diffusion-v2-infinite-zoom-with-output.rst +++ /dev/null @@ -1,483 +0,0 @@ -Infinite Zoom Stable Diffusion v2 and OpenVINO™ -=============================================== - -Stable Diffusion v2 is the next generation of Stable Diffusion model a -Text-to-Image latent diffusion model created by the researchers and -engineers from `Stability AI `__ and -`LAION `__. - -General diffusion models are machine learning systems that are trained -to denoise random gaussian noise step by step, to get to a sample of -interest, such as an image. Diffusion models have shown to achieve -state-of-the-art results for generating image data. But one downside of -diffusion models is that the reverse denoising process is slow. 
In -addition, these models consume a lot of memory because they operate in -pixel space, which becomes unreasonably expensive when generating -high-resolution images. Therefore, it is challenging to train these -models and also use them for inference. OpenVINO brings capabilities to -run model inference on Intel hardware and opens the door to the -fantastic world of diffusion models for everyone! - -In previous notebooks, we already discussed how to run `Text-to-Image -generation and Image-to-Image generation using Stable Diffusion -v1 `__ -and `controlling its generation process using -ControlNet `__. -Now is turn of Stable Diffusion v2. - -Stable Diffusion v2: What’s new? --------------------------------- - -The new stable diffusion model offers a bunch of new features inspired -by the other models that have emerged since the introduction of the -first iteration. Some of the features that can be found in the new model -are: - -- The model comes with a new robust encoder, OpenCLIP, created by LAION - and aided by Stability AI; this version v2 significantly enhances the - produced photos over the V1 versions. -- The model can now generate images in a 768x768 resolution, offering - more information to be shown in the generated images. -- The model finetuned with - `v-objective `__. The - v-parameterization is particularly useful for numerical stability - throughout the diffusion process to enable progressive distillation - for models. For models that operate at higher resolution, it is also - discovered that the v-parameterization avoids color shifting - artifacts that are known to affect high resolution diffusion models, - and in the video setting it avoids temporal color shifting that - sometimes appears with epsilon-prediction used in Stable Diffusion - v1. -- The model also comes with a new diffusion model capable of running - upscaling on the images generated. Upscaled images can be adjusted up - to 4 times the original image. Provided as separated model, for more - details please check - `stable-diffusion-x4-upscaler `__ -- The model comes with a new refined depth architecture capable of - preserving context from prior generation layers in an image-to-image - setting. This structure preservation helps generate images that - preserving forms and shadow of objects, but with different content. -- The model comes with an updated inpainting module built upon the - previous model. This text-guided inpainting makes switching out parts - in the image easier than before. - -This notebook demonstrates how to download the model from the Hugging -Face Hub and converted to OpenVINO IR format with `Optimum -Intel `__. -And how to use the model to generate sequence of images for infinite -zoom video effect. - - -**Table of contents:** - - -- `Stable Diffusion v2 Infinite Zoom - Showcase <#stable-diffusion-v2-infinite-zoom-showcase>`__ - - - `Stable Diffusion Text guided - Inpainting <#stable-diffusion-text-guided-inpainting>`__ - -- `Prerequisites <#prerequisites>`__ -- `Load Stable Diffusion Inpaint pipeline using Optimum - Intel <#load-stable-diffusion-inpaint-pipeline-using-optimum-intel>`__ -- `Zoom Video Generation <#zoom-video-generation>`__ -- `Run Infinite Zoom video - generation <#run-infinite-zoom-video-generation>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. 
For details, please refer to -`Installation -Guide `__. - -Stable Diffusion v2 Infinite Zoom Showcase ------------------------------------------- - - - -In this tutorial we consider how to use Stable Diffusion v2 model for -generation sequence of images for infinite zoom video effect. To do -this, we will need -`stabilityai/stable-diffusion-2-inpainting `__ -model. - -Stable Diffusion Text guided Inpainting -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -In image editing, inpainting is a process of restoring missing parts of -pictures. Most commonly applied to reconstructing old deteriorated -images, removing cracks, scratches, dust spots, or red-eyes from -photographs. - -But with the power of AI and the Stable Diffusion model, inpainting can -be used to achieve more than that. For example, instead of just -restoring missing parts of an image, it can be used to render something -entirely new in any part of an existing picture. Only your imagination -limits it. - -The workflow diagram explains how Stable Diffusion inpainting pipeline -for inpainting works: - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/22090501/9ac6de45-186f-4a3c-aa20-825825a337eb - :alt: sd2-inpainting - - sd2-inpainting - -The pipeline has a lot of common with Text-to-Image generation pipeline -discussed in previous section. Additionally to text prompt, pipeline -accepts input source image and mask which provides an area of image -which should be modified. Masked image encoded by VAE encoder into -latent diffusion space and concatenated with randomly generated (on -initial step only) or produced by U-Net latent generated image -representation and used as input for next step denoising. - -Using this inpainting feature, decreasing image by certain margin and -masking this border for every new frame we can create interesting Zoom -Out video based on our prompt. - -Prerequisites -------------- - - - -install required packages - -.. code:: ipython3 - - %pip install -q "diffusers>=0.14.0" "transformers>=4.25.1" "gradio>=4.19" "openvino>=2024.2.0" "torch>=2.1" Pillow opencv-python "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu - -Load Stable Diffusion Inpaint pipeline using Optimum Intel ----------------------------------------------------------- - - - -We will load optimized Stable Diffusion model from the Hugging Face Hub -and create pipeline to run an inference with OpenVINO Runtime by -`Optimum -Intel `__. - -For running the Stable Diffusion model with Optimum Intel, we will use -the optimum.intel.OVStableDiffusionInpaintPipeline class, which -represents the inference pipeline. OVStableDiffusionInpaintPipeline -initialized by the from_pretrained method. It supports on-the-fly -conversion models from PyTorch using the export=True parameter. A -converted model can be saved on disk using the save_pretrained method -for the next running. - -Select device from dropdown list for running inference using OpenVINO. - -.. 
code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("stable-diffusion-v2-infinite-zoom.ipynb") - - from notebook_utils import device_widget - - device = device_widget() - - device - -.. code:: ipython3 - - from optimum.intel.openvino import OVStableDiffusionInpaintPipeline - from pathlib import Path - - DEVICE = device.value - - MODEL_ID = "stabilityai/stable-diffusion-2-inpainting" - MODEL_DIR = Path("sd2_inpainting") - - if not MODEL_DIR.exists(): - ov_pipe = OVStableDiffusionInpaintPipeline.from_pretrained(MODEL_ID, export=True, device=DEVICE, compile=False) - ov_pipe.save_pretrained(MODEL_DIR) - else: - ov_pipe = OVStableDiffusionInpaintPipeline.from_pretrained(MODEL_DIR, device=DEVICE, compile=False) - - ov_pipe.compile() - -Zoom Video Generation ---------------------- - - - -For achieving zoom effect, we will use inpainting to expand images -beyond their original borders. We run our -``OVStableDiffusionInpaintPipeline`` in the loop, where each next frame -will add edges to previous. The frame generation process illustrated on -diagram below: - -.. figure:: https://user-images.githubusercontent.com/29454499/228739686-436f2759-4c79-42a2-a70f-959fb226834c.png - :alt: frame generation) - - frame generation) - -After processing current frame, we decrease size of current image by -mask size pixels from each side and use it as input for next step. -Changing size of mask we can influence the size of painting area and -image scaling. - -There are 2 zooming directions: - -- Zoom Out - move away from object -- Zoom In - move closer to object - -Zoom In will be processed in the same way as Zoom Out, but after -generation is finished, we record frames in reversed order. - -.. code:: ipython3 - - from typing import List, Union - - import PIL - import cv2 - from tqdm import trange - import numpy as np - - - def generate_video( - pipe, - prompt: Union[str, List[str]], - negative_prompt: Union[str, List[str]], - guidance_scale: float = 7.5, - num_inference_steps: int = 20, - num_frames: int = 20, - mask_width: int = 128, - seed: int = 9999, - zoom_in: bool = False, - ): - """ - Zoom video generation function - - Parameters: - pipe (OVStableDiffusionInpaintingPipeline): inpainting pipeline. - prompt (str or List[str]): The prompt or prompts to guide the image generation. - negative_prompt (str or List[str]): The negative prompt or prompts to guide the image generation. - guidance_scale (float, *optional*, defaults to 7.5): - Guidance scale as defined in Classifier-Free Diffusion Guidance(https://arxiv.org/abs/2207.12598). - guidance_scale is defined as `w` of equation 2. - Higher guidance scale encourages to generate images that are closely linked to the text prompt, - usually at the expense of lower image quality. - num_inference_steps (int, *optional*, defaults to 50): The number of denoising steps for each frame. More denoising steps usually lead to a higher quality image at the expense of slower inference. - num_frames (int, *optional*, 20): number frames for video. - mask_width (int, *optional*, 128): size of border mask for inpainting on each step. 
- zoom_in (bool, *optional*, False): zoom mode Zoom In or Zoom Out. - Returns: - output_path (str): Path where generated video loacated. - """ - - height = 512 - width = height - - current_image = PIL.Image.new(mode="RGBA", size=(height, width)) - mask_image = np.array(current_image)[:, :, 3] - mask_image = PIL.Image.fromarray(255 - mask_image).convert("RGB") - current_image = current_image.convert("RGB") - init_images = pipe( - prompt=prompt, - negative_prompt=negative_prompt, - image=current_image, - guidance_scale=guidance_scale, - mask_image=mask_image, - num_inference_steps=num_inference_steps, - ).images - - image_grid(init_images, rows=1, cols=1) - - num_outpainting_steps = num_frames - num_interpol_frames = 30 - - current_image = init_images[0] - all_frames = [] - all_frames.append(current_image) - for i in trange( - num_outpainting_steps, - desc=f"Generating {num_outpainting_steps} additional images...", - ): - prev_image_fix = current_image - - prev_image = shrink_and_paste_on_blank(current_image, mask_width) - - current_image = prev_image - - # create mask (black image with white mask_width width edges) - mask_image = np.array(current_image)[:, :, 3] - mask_image = PIL.Image.fromarray(255 - mask_image).convert("RGB") - - # inpainting step - current_image = current_image.convert("RGB") - images = pipe( - prompt=prompt, - negative_prompt=negative_prompt, - image=current_image, - guidance_scale=guidance_scale, - mask_image=mask_image, - num_inference_steps=num_inference_steps, - ).images - current_image = images[0] - current_image.paste(prev_image, mask=prev_image) - - # interpolation steps bewteen 2 inpainted images (=sequential zoom and crop) - for j in range(num_interpol_frames - 1): - interpol_image = current_image - interpol_width = round((1 - (1 - 2 * mask_width / height) ** (1 - (j + 1) / num_interpol_frames)) * height / 2) - interpol_image = interpol_image.crop( - ( - interpol_width, - interpol_width, - width - interpol_width, - height - interpol_width, - ) - ) - - interpol_image = interpol_image.resize((height, width)) - - # paste the higher resolution previous image in the middle to avoid drop in quality caused by zooming - interpol_width2 = round((1 - (height - 2 * mask_width) / (height - 2 * interpol_width)) / 2 * height) - prev_image_fix_crop = shrink_and_paste_on_blank(prev_image_fix, interpol_width2) - interpol_image.paste(prev_image_fix_crop, mask=prev_image_fix_crop) - all_frames.append(interpol_image) - all_frames.append(current_image) - - video_file_name = f"infinite_zoom_{'in' if zoom_in else 'out'}" - fps = 30 - save_path = video_file_name + ".mp4" - write_video(save_path, all_frames, fps, reversed_order=zoom_in) - return save_path - -.. code:: ipython3 - - def shrink_and_paste_on_blank(current_image: PIL.Image.Image, mask_width: int): - """ - Decreases size of current_image by mask_width pixels from each side, - then adds a mask_width width transparent frame, - so that the image the function returns is the same size as the input. 
- - Parameters: - current_image (PIL.Image): input image to transform - mask_width (int): width in pixels to shrink from each side - Returns: - prev_image (PIL.Image): resized image with extended borders - """ - - height = current_image.height - width = current_image.width - - # shrink down by mask_width - prev_image = current_image.resize((height - 2 * mask_width, width - 2 * mask_width)) - prev_image = prev_image.convert("RGBA") - prev_image = np.array(prev_image) - - # create blank non-transparent image - blank_image = np.array(current_image.convert("RGBA")) * 0 - blank_image[:, :, 3] = 1 - - # paste the shrunk image onto the blank canvas - blank_image[mask_width : height - mask_width, mask_width : width - mask_width, :] = prev_image - prev_image = PIL.Image.fromarray(blank_image) - - return prev_image - - - def image_grid(imgs: List[PIL.Image.Image], rows: int, cols: int): - """ - Arranges images into a grid - - Parameters: - imgs (List[PIL.Image.Image]): list of images for making grid - rows (int): number of rows in grid - cols (int): number of columns in grid - Returns: - grid (PIL.Image): image with input images collage - """ - assert len(imgs) == rows * cols - - w, h = imgs[0].size - grid = PIL.Image.new("RGB", size=(cols * w, rows * h)) - - for i, img in enumerate(imgs): - grid.paste(img, box=(i % cols * w, i // cols * h)) - return grid - - - def write_video( - file_path: str, - frames: List[PIL.Image.Image], - fps: float, - reversed_order: bool = True, - gif: bool = True, - ): - """ - Writes frames to an mp4 video file and optionally to a gif - - Parameters: - file_path (str): Path to output video, must end with .mp4 - frames (List of PIL.Image): list of frames - fps (float): Desired frame rate - reversed_order (bool): whether the order of frames should be reversed (default = True) - gif (bool): save frames to gif format (default = True) - Returns: - None - """ - if reversed_order: - frames.reverse() - - w, h = frames[0].size - fourcc = cv2.VideoWriter_fourcc("m", "p", "4", "v") - writer = cv2.VideoWriter(file_path, fourcc, fps, (w, h)) - - for frame in frames: - np_frame = np.array(frame.convert("RGB")) - cv_frame = cv2.cvtColor(np_frame, cv2.COLOR_RGB2BGR) - writer.write(cv_frame) - - writer.release() - if gif: - frames[0].save( - file_path.replace(".mp4", ".gif"), - save_all=True, - append_images=frames[1:], - duration=1000 / fps, # per-frame display time in milliseconds - loop=0, - ) - -Run Infinite Zoom video generation ----------------------------------- - - - -.. 
code:: ipython3 - - import requests - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-diffusion-v2/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo_zoom_video - - demo = make_demo_zoom_video(ov_pipe, generate_video) - - try: - demo.queue().launch() - except Exception: - demo.queue().launch(share=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/stable-diffusion-v2-optimum-demo-with-output.rst b/docs/notebooks/stable-diffusion-v2-optimum-demo-with-output.rst deleted file mode 100644 index aa4e4a55e95df1..00000000000000 --- a/docs/notebooks/stable-diffusion-v2-optimum-demo-with-output.rst +++ /dev/null @@ -1,193 +0,0 @@ -Stable Diffusion v2.1 using Optimum-Intel OpenVINO and multiple Intel Hardware -============================================================================== - -This notebook provides a way to see how models of different precisions -perform on different hardware. It was created to showcase the use of -Optimum-Intel-OpenVINO and is not optimized for running multiple -times. - -|image0| - - -**Table of contents:** - - -- `Showing Info Available Devices <#showing-info-available-devices>`__ -- `Configure Inference Pipeline <#configure-inference-pipeline>`__ -- `Convert model using Optimum <#convert-model-using-optimum>`__ -- `Run model using - OVStableDiffusionPipeline <#run-model-using-ovstablediffusionpipeline>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. |image0| image:: https://github.com/openvinotoolkit/openvino_notebooks/assets/10940214/1858dae4-72fd-401e-b055-66d503d82446 - -Optimum Intel is the interface between the Transformers and Diffusers -libraries and the different tools and libraries provided by Intel to -accelerate end-to-end pipelines on Intel architectures. More details in -this -`repository `__. - -``Note: We suggest you create a separate environment and run the following installation command there.`` - -.. code:: ipython3 - - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" "diffusers>=0.25.0" "openvino>=2024.4.0" "ipywidgets" "transformers>=4.33.0" "torch>=2.1" --extra-index-url https://download.pytorch.org/whl/cpu - -The Stable Diffusion pipeline brings six elements together: a text -encoder model with a tokenizer, a UNet model with a scheduler, and an -autoencoder with encoder and decoder models. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/10940214/e166f225-1220-44aa-a987-84471e03947d - :alt: image - - image - -The base model used for this example is the -stabilityai/stable-diffusion-2-1-base. This model was converted to -OpenVINO format for accelerated inference on CPU or Intel GPU with -OpenVINO’s integration into Optimum. - -.. code:: ipython3 - - import warnings - - warnings.filterwarnings("ignore") - -Showing Info Available Devices -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The ``available_devices`` property shows the available devices in your -system. 
The “FULL_DEVICE_NAME” option to ``ie.get_property()`` shows the -name of the device. Check what is the ID name for the discrete GPU, if -you have integrated GPU (iGPU) and discrete GPU (dGPU), it will show -``device_name="GPU.0"`` for iGPU and ``device_name="GPU.1"`` for dGPU. -If you just have either an iGPU or dGPU that will be assigned to -``"GPU"`` - -.. code:: ipython3 - - import openvino as ov - import openvino.properties as props - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("stable-diffusion-v2-optimum-demo.ipynb") - - - core = ov.Core() - devices = core.available_devices - - for device in devices: - device_name = core.get_property(device, props.device.full_name) - print(f"{device}: {device_name}") - - -.. parsed-literal:: - - CPU: Intel(R) Core(TM) Ultra 7 155H - GPU: Intel(R) Arc(TM) Graphics (iGPU) - NPU: Intel(R) AI Boost - - -Configure Inference Pipeline -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget("CPU") - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'GPU', 'NPU', 'AUTO'), value='GPU') - - - -Convert model using Optimum -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from pathlib import Path - - name = "stabilityai/stable-diffusion-2-1-base" - model_dir = Path(name.split("/")[-1]) - - if not model_dir.exists(): - !optimum-cli export openvino -m {name} {model_dir} - -Run model using ``OVStableDiffusionPipeline`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from optimum.intel.openvino import OVStableDiffusionPipeline - - # download and converted SD v2.1 model from Hugging Face Hub - - ov_pipe = OVStableDiffusionPipeline.from_pretrained(model_dir, compile=False, device=device.value) - ov_pipe.reshape(batch_size=1, height=512, width=512, num_images_per_prompt=1) - ov_pipe.compile() - -.. code:: ipython3 - - import gc - - # Generate an image. - prompt = "red car in snowy forest, epic vista, beautiful landscape, 4k, 8k" - output_ov = ov_pipe(prompt, num_inference_steps=17, output_type="pil").images[0] - output_ov.save("image.png") - output_ov - - - -.. parsed-literal:: - - 0%| | 0/18 [00:00`__ -notebook for demo purposes and to get started quickly. This version does -not have the full implementation of the helper utilities needed to -convert the models from PyTorch to OpenVINO, and the OpenVINO -``OVStableDiffusionPipeline`` within the notebook directly. If you would -like to see the full implementation of stable diffusion for text to -image, please visit -`stable-diffusion-v2-text-to-image `__. 
- - -**Table of contents:** - - -- `Step 0: Install and import - prerequisites <#step-0-install-and-import-prerequisites>`__ -- `Step 1: Stable Diffusion v2 Fundamental - components <#step-1-stable-diffusion-v2-fundamental-components>`__ - - - `Step 1.1: Retrieve components from - HuggingFace <#step-1-1-retrieve-components-from-huggingface>`__ - -- `Step 2: Convert the models to - OpenVINO <#step-2-convert-the-models-to-openvino>`__ -- `Step 3: Text-to-Image Generation Inference - Pipeline <#step-3-text-to-image-generation-inference-pipeline>`__ - - - `Step 3.1: Load and Understand Text to Image OpenVINO - models <#step-3-1-load-and-understand-text-to-image-openvino-models>`__ - - `Step 3.2: Select inference - device <#step-3-2-select-inference-device>`__ - - `Step 3.3: Run Text-to-Image - generation <#step-3-3-run-text-to-image-generation>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Step 0: Install and import prerequisites ----------------------------------------- - - - -To work with Stable Diffusion v2, we will use Hugging Face’s -`Diffusers `__ library. - -To experiment with Stable Diffusion models, Diffusers exposes the -`StableDiffusionPipeline `__ -and ``StableDiffusionInpaintPipeline``, similar to the `other Diffusers -pipelines `__. - -.. code:: ipython3 - - %pip install -q "diffusers>=0.14.0" "openvino>=2023.1.0" "transformers>=4.31" accelerate "torch>=2.1" Pillow opencv-python --extra-index-url https://download.pytorch.org/whl/cpu - -Step 1: Stable Diffusion v2 Fundamental components --------------------------------------------------- - - - -Stable Diffusion pipelines for both Text to Image and Inpainting consist -of three important parts: - -1. A Text Encoder to create conditions: for example, generating an image - from a text prompt or performing inpainting to create an infinite - zoom effect. -2. A U-Net for step-by-step denoising of latent image representation. -3. An Autoencoder (VAE) for decoding the latent space to an image. - -Depending on the pipeline, the parameters for these parts can differ, -which we’ll explore in this demo! - -Step 1.1: Retrieve components from HuggingFace -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Let’s start by retrieving these components from HuggingFace! - -The code below demonstrates how to create ``StableDiffusionPipeline`` -using ``stable-diffusion-2-1``. - -.. 
code:: ipython3 - - # Retrieve the Text to Image Stable Diffusion pipeline components - from diffusers import StableDiffusionPipeline - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("stable-diffusion-v2-text-to-image-demo.ipynb") - - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base").to("cpu") - - # for reducing memory consumption get all components from pipeline independently - text_encoder = pipe.text_encoder - text_encoder.eval() - unet = pipe.unet - unet.eval() - vae = pipe.vae - vae.eval() - - conf = pipe.scheduler.config - - del pipe - -Step 2: Convert the models to OpenVINO --------------------------------------- - - - -Now that we’ve retrieved the three parts for both of these pipelines, we -now need to: - -1. Convert the original PyTorch models to OpenVINO format using Model - Conversion API - -:: - - ov_model_part = ov.convert_model(model_part, example_input=input_data) - -2. Save OpenVINO models on disk: - -:: - - ov.save_model(ov_model_part, xml_file_path) - -We can then run our Stable Diffusion v2 text to image and inpainting -pipelines in OpenVINO on our own data! - -.. code:: ipython3 - - from pathlib import Path - - # Define a dir to save text-to-image models - txt2img_model_dir = Path("sd2.1") - txt2img_model_dir.mkdir(exist_ok=True) - -.. code:: ipython3 - - from implementation.conversion_helper_utils import ( - convert_encoder, - convert_unet, - convert_vae_decoder, - convert_vae_encoder, - ) - - # Convert the Text-to-Image models from PyTorch -> OpenVINO - # 1. Convert the Text Encoder - txt_encoder_ov_path = txt2img_model_dir / "text_encoder.xml" - convert_encoder(text_encoder, txt_encoder_ov_path) - # 2. Convert the U-NET - unet_ov_path = txt2img_model_dir / "unet.xml" - convert_unet(unet, unet_ov_path, num_channels=4, width=96, height=96) - # 3. Convert the VAE encoder - vae_encoder_ov_path = txt2img_model_dir / "vae_encoder.xml" - convert_vae_encoder(vae, vae_encoder_ov_path, width=768, height=768) - # 4. Convert the VAE decoder - vae_decoder_ov_path = txt2img_model_dir / "vae_decoder.xml" - convert_vae_decoder(vae, vae_decoder_ov_path, width=96, height=96) - -Step 3: Text-to-Image Generation Inference Pipeline ---------------------------------------------------- - - - -Step 3.1: Load and Understand Text to Image OpenVINO models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Step 3.2: Select inference device -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -Let’s create instances of our OpenVINO Model for Text to Image. - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - - text_enc = core.compile_model(txt_encoder_ov_path, device.value) - -.. code:: ipython3 - - unet_model = core.compile_model(unet_ov_path, device.value) - -.. 
code:: ipython3 - - vae_encoder = core.compile_model(vae_encoder_ov_path, device.value) - vae_decoder = core.compile_model(vae_decoder_ov_path, device.value) - -Next, we will define a few key elements to create the inference -pipeline, as depicted in the diagram below: - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/22090501/ec454103-0d28-48e3-a18e-b55da3fab381 - :alt: text2img-stable-diffusion - - text2img-stable-diffusion - -As part of the ``OVStableDiffusionPipeline()`` class: - -1. The stable diffusion pipeline takes both a latent seed and a text - prompt as input. The latent seed is used to generate random latent - image representations, and the text prompt is provided to OpenAI’s - CLIP to transform these to text embeddings. - -2. Next, the U-Net model iteratively denoises the random latent image - representations while being conditioned on the text embeddings. The - output of the U-Net, being the noise residual, is used to compute a - denoised latent image representation via a scheduler algorithm. In - this case we use the ``LMSDiscreteScheduler``. - -.. code:: ipython3 - - from diffusers.schedulers import LMSDiscreteScheduler - from transformers import CLIPTokenizer - from implementation.ov_stable_diffusion_pipeline import OVStableDiffusionPipeline - - scheduler = LMSDiscreteScheduler.from_config(conf) - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - - ov_pipe = OVStableDiffusionPipeline( - tokenizer=tokenizer, - text_encoder=text_enc, - unet=unet_model, - vae_encoder=vae_encoder, - vae_decoder=vae_decoder, - scheduler=scheduler, - ) - - -.. parsed-literal:: - - /home/ea/work/openvino_notebooks/notebooks/stable-diffusion-v2/implementation/ov_stable_diffusion_pipeline.py:10: FutureWarning: Importing `DiffusionPipeline` or `ImagePipelineOutput` from diffusers.pipeline_utils is deprecated. Please import from diffusers.pipelines.pipeline_utils instead. - from diffusers.pipeline_utils import DiffusionPipeline - - -Step 3.3: Run Text-to-Image generation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Now, let’s define some text prompts for image generation and run our -inference pipeline. - -We can also change our random generator seed for latent state -initialization and number of steps (higher steps = more precise -results). - -Example prompts: - -- “valley in the Alps at sunset, epic vista, beautiful landscape, 4k, - 8k” -- "city filled with cyborgs, modern, industrial, 4k, 8k - -To improve image generation quality, we can use negative prompting. -While positive prompts steer diffusion toward the images associated with -it, negative prompts declares undesired concepts for the generation -image, e.g. if we want to have colorful and bright images, a gray scale -image will be result which we want to avoid. In this case, a gray scale -can be treated as negative prompt. The positive and negative prompt are -in equal footing. You can always use one with or without the other. More -explanation of how it works can be found in this -`article `__. - -.. 
code:: ipython3 - - import ipywidgets as widgets - - text_prompt = widgets.Textarea( - value="valley in the Alps at sunset, epic vista, beautiful landscape, 4k, 8k", - description="positive prompt", - layout=widgets.Layout(width="auto"), - ) - negative_prompt = widgets.Textarea( - value="frames, borderline, text, charachter, duplicate, error, out of frame, watermark, low quality, ugly, deformed, blur", - description="negative prompt", - layout=widgets.Layout(width="auto"), - ) - num_steps = widgets.IntSlider(min=1, max=50, value=25, description="steps:") - seed = widgets.IntSlider(min=0, max=10000000, description="seed: ", value=42) - widgets.VBox([text_prompt, negative_prompt, seed, num_steps]) - - - - -.. parsed-literal:: - - VBox(children=(Textarea(value='valley in the Alps at sunset, epic vista, beautiful landscape, 4k, 8k', descrip… - - - -.. code:: ipython3 - - # Run inference pipeline - result = ov_pipe( - text_prompt.value, - negative_prompt=negative_prompt.value, - num_inference_steps=num_steps.value, - seed=seed.value, - ) - - - -.. parsed-literal:: - - 0%| | 0/25 [00:00`__ and -`LAION `__. - -General diffusion models are machine learning systems that are trained -to denoise random gaussian noise step by step, to get to a sample of -interest, such as an image. Diffusion models have shown to achieve -state-of-the-art results for generating image data. But one downside of -diffusion models is that the reverse denoising process is slow. In -addition, these models consume a lot of memory because they operate in -pixel space, which becomes unreasonably expensive when generating -high-resolution images. Therefore, it is challenging to train these -models and also use them for inference. OpenVINO brings capabilities to -run model inference on Intel hardware and opens the door to the -fantastic world of diffusion models for everyone! - -In previous notebooks, we already discussed how to run `Text-to-Image -generation and Image-to-Image generation using Stable Diffusion -v1 `__ -and `controlling its generation process using -ControlNet `__. -Now is turn of Stable Diffusion v2. - -Stable Diffusion v2: What’s new? --------------------------------- - -The new stable diffusion model offers a bunch of new features inspired -by the other models that have emerged since the introduction of the -first iteration. Some of the features that can be found in the new model -are: - -- The model comes with a new robust encoder, OpenCLIP, created by LAION - and aided by Stability AI; this version v2 significantly enhances the - produced photos over the V1 versions. -- The model can now generate images in a 768x768 resolution, offering - more information to be shown in the generated images. -- The model finetuned with - `v-objective `__. The - v-parameterization is particularly useful for numerical stability - throughout the diffusion process to enable progressive distillation - for models. For models that operate at higher resolution, it is also - discovered that the v-parameterization avoids color shifting - artifacts that are known to affect high resolution diffusion models, - and in the video setting it avoids temporal color shifting that - sometimes appears with epsilon-prediction used in Stable Diffusion - v1. -- The model also comes with a new diffusion model capable of running - upscaling on the images generated. Upscaled images can be adjusted up - to 4 times the original image. 
Provided as separated model, for more - details please check - `stable-diffusion-x4-upscaler `__ -- The model comes with a new refined depth architecture capable of - preserving context from prior generation layers in an image-to-image - setting. This structure preservation helps generate images that - preserving forms and shadow of objects, but with different content. -- The model comes with an updated inpainting module built upon the - previous model. This text-guided inpainting makes switching out parts - in the image easier than before. - -This notebook demonstrates how to convert and run Stable Diffusion v2 -model using OpenVINO. - -Notebook contains the following steps: - -1. Create PyTorch models pipeline using Diffusers library. -2. Convert PyTorch models to OpenVINO IR format, using model conversion - API. -3. Apply hybrid post-training quantization to UNet model with - `NNCF `__. -4. Run Stable Diffusion v2 Text-to-Image pipeline with OpenVINO. - -**Note:** This is the full version of the Stable Diffusion text-to-image -implementation. If you would like to get started and run the notebook -quickly, check out `stable-diffusion-v2-text-to-image-demo -notebook `__. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Stable Diffusion v2 for Text-to-Image - Generation <#stable-diffusion-v2-for-text-to-image-generation>`__ - - - `Stable Diffusion in Diffusers - library <#stable-diffusion-in-diffusers-library>`__ - - `Convert models to OpenVINO Intermediate representation (IR) - format <#convert-models-to-openvino-intermediate-representation-ir-format>`__ - - `Text Encoder <#text-encoder>`__ - - `U-Net <#u-net>`__ - - `VAE <#vae>`__ - - `Prepare Inference Pipeline <#prepare-inference-pipeline>`__ - - `Configure Inference Pipeline <#configure-inference-pipeline>`__ - -- `Quantization <#quantization>`__ - - - `Prepare calibration dataset <#prepare-calibration-dataset>`__ - - `Run Hybrid Model Quantization <#run-hybrid-model-quantization>`__ - - `Compare inference time of the FP16 and INT8 - pipelines <#compare-inference-time-of-the-fp16-and-int8-pipelines>`__ - -- `Run Text-to-Image generation <#run-text-to-image-generation>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -install required packages - -.. code:: ipython3 - - %pip install -q "diffusers>=0.14.0" "openvino>=2023.1.0" "datasets>=2.14.6" "transformers>=4.25.1" "gradio>=4.19" "torch>=2.1" Pillow opencv-python --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "nncf>=2.9.0" - -Stable Diffusion v2 for Text-to-Image Generation ------------------------------------------------- - - - -To start, let’s look on Text-to-Image process for Stable Diffusion v2. -We will use `Stable Diffusion -v2-1 `__ model -for these purposes. The main difference from Stable Diffusion v2 and -Stable Diffusion v2.1 is usage of more data, more training, and less -restrictive filtering of the dataset, that gives promising results for -selecting wide range of input text prompts. More details about model can -be found in `Stability AI blog -post `__ -and original model -`repository `__. 
- -Stable Diffusion in Diffusers library -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -To work with Stable Diffusion -v2, we will use Hugging Face -`Diffusers `__ library. To -experiment with Stable Diffusion models, Diffusers exposes the -`StableDiffusionPipeline `__ -similar to the `other Diffusers -pipelines `__. -The code below demonstrates how to create ``StableDiffusionPipeline`` -using ``stable-diffusion-2-1``: - -.. code:: ipython3 - - from diffusers import StableDiffusionPipeline - - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base").to("cpu") - - # for reducing memory consumption get all components from pipeline independently - text_encoder = pipe.text_encoder - text_encoder.eval() - unet = pipe.unet - unet.eval() - vae = pipe.vae - vae.eval() - - conf = pipe.scheduler.config - - del pipe - - - -.. parsed-literal:: - - Loading pipeline components...: 0%| | 0/6 [00:00`__ -- `DDIM - scheduler `__ -- `K-LMS - scheduler `__ - -Theory on how the scheduler algorithm function works is out of scope for -this notebook, but in short, you should remember that they compute the -predicted denoised image representation from the previous noise -representation and the predicted noise residual. For more information, -it is recommended to look into `Elucidating the Design Space of -Diffusion-Based Generative Models `__. - -The chart above looks very similar to Stable Diffusion V1 from -`notebook `__, -but there is some small difference in details: - -- Changed input resolution for U-Net model. -- Changed text encoder and as the result size of its hidden state - embeddings. -- Additionally, to improve image generation quality authors introduced - negative prompting. Technically, positive prompt steers the diffusion - toward the images associated with it, while negative prompt steers - the diffusion away from it.In other words, negative prompt declares - undesired concepts for generation image, e.g. if we want to have - colorful and bright image, gray scale image will be result which we - want to avoid, in this case gray scale can be treated as negative - prompt. The positive and negative prompt are in equal footing. You - can always use one with or without the other. More explanation of how - it works can be found in this - `article `__. - -.. code:: ipython3 - - import inspect - from typing import List, Optional, Union, Dict - - import PIL - import cv2 - import torch - - from transformers import CLIPTokenizer - from diffusers import DiffusionPipeline - from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler - - - def scale_fit_to_window(dst_width: int, dst_height: int, image_width: int, image_height: int): - """ - Preprocessing helper function for calculating image size for resize with peserving original aspect ratio - and fitting image to specific window size - - Parameters: - dst_width (int): destination window width - dst_height (int): destination window height - image_width (int): source image width - image_height (int): source image height - Returns: - result_width (int): calculated width for resize - result_height (int): calculated height for resize - """ - im_scale = min(dst_height / image_height, dst_width / image_width) - return int(im_scale * image_width), int(im_scale * image_height) - - - def preprocess(image: PIL.Image.Image): - """ - Image preprocessing function. 
Takes image in PIL.Image format, resizes it to keep aspect ration and fits to model input window 512x512, - then converts it to np.ndarray and adds padding with zeros on right or bottom side of image (depends from aspect ratio), after that - converts data to float32 data type and change range of values from [0, 255] to [-1, 1], finally, converts data layout from planar NHWC to NCHW. - The function returns preprocessed input tensor and padding size, which can be used in postprocessing. - - Parameters: - image (PIL.Image.Image): input image - Returns: - image (np.ndarray): preprocessed image tensor - meta (Dict): dictionary with preprocessing metadata info - """ - src_width, src_height = image.size - dst_width, dst_height = scale_fit_to_window(512, 512, src_width, src_height) - image = np.array(image.resize((dst_width, dst_height), resample=PIL.Image.Resampling.LANCZOS))[None, :] - pad_width = 512 - dst_width - pad_height = 512 - dst_height - pad = ((0, 0), (0, pad_height), (0, pad_width), (0, 0)) - image = np.pad(image, pad, mode="constant") - image = image.astype(np.float32) / 255.0 - image = 2.0 * image - 1.0 - image = image.transpose(0, 3, 1, 2) - return image, {"padding": pad, "src_width": src_width, "src_height": src_height} - - - class OVStableDiffusionPipeline(DiffusionPipeline): - def __init__( - self, - vae_decoder: ov.Model, - text_encoder: ov.Model, - tokenizer: CLIPTokenizer, - unet: ov.Model, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], - vae_encoder: ov.Model = None, - ): - """ - Pipeline for text-to-image generation using Stable Diffusion. - Parameters: - vae_decoder (Model): - Variational Auto-Encoder (VAE) Model to decode images to and from latent representations. - text_encoder (Model): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the clip-vit-large-patch14(https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (CLIPTokenizer): - Tokenizer of class CLIPTokenizer(https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet (Model): Conditional U-Net architecture to denoise the encoded image latents. - vae_encoder (Model): - Variational Auto-Encoder (VAE) Model to encode images to latent representation. - scheduler (SchedulerMixin): - A scheduler to be used in combination with unet to denoise the encoded image latents. Can be one of - DDIMScheduler, LMSDiscreteScheduler, or PNDMScheduler. - """ - super().__init__() - self.scheduler = scheduler - self.vae_decoder = vae_decoder - self.vae_encoder = vae_encoder - self.text_encoder = text_encoder - self.unet = unet - self.register_to_config(unet=unet) - self._text_encoder_output = text_encoder.output(0) - self._unet_output = unet.output(0) - self._vae_d_output = vae_decoder.output(0) - self._vae_e_output = vae_encoder.output(0) if vae_encoder is not None else None - self.height = self.unet.input(0).shape[2] * 8 - self.width = self.unet.input(0).shape[3] * 8 - self.tokenizer = tokenizer - - def __call__( - self, - prompt: Union[str, List[str]], - image: PIL.Image.Image = None, - negative_prompt: Union[str, List[str]] = None, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - eta: Optional[float] = 0.0, - output_type: Optional[str] = "pil", - seed: Optional[int] = None, - strength: float = 1.0, - ): - """ - Function invoked when calling the pipeline for generation. 
- Parameters: - prompt (str or List[str]): - The prompt or prompts to guide the image generation. - image (PIL.Image.Image, *optional*, None): - Intinal image for generation. - negative_prompt (str or List[str]): - The negative prompt or prompts to guide the image generation. - num_inference_steps (int, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (float, *optional*, defaults to 7.5): - Guidance scale as defined in Classifier-Free Diffusion Guidance(https://arxiv.org/abs/2207.12598). - guidance_scale is defined as `w` of equation 2. - Higher guidance scale encourages to generate images that are closely linked to the text prompt, - usually at the expense of lower image quality. - eta (float, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [DDIMScheduler], will be ignored for others. - output_type (`str`, *optional*, defaults to "pil"): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): PIL.Image.Image or np.array. - seed (int, *optional*, None): - Seed for random generator state initialization. - strength (int, *optional*, 1.0): - strength between initial image and generated in Image-to-Image pipeline, do not used in Text-to-Image - Returns: - Dictionary with keys: - sample - the last generated image PIL.Image.Image or np.array - """ - if seed is not None: - np.random.seed(seed) - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # get prompt text embeddings - text_embeddings = self._encode_prompt( - prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - negative_prompt=negative_prompt, - ) - # set timesteps - accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) - extra_set_kwargs = {} - if accepts_offset: - extra_set_kwargs["offset"] = 1 - - self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - latent_timestep = timesteps[:1] - - # get the initial random noise unless the user supplied it - latents, meta = self.prepare_latents(image, latent_timestep) - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - for t in self.progress_bar(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet([latent_model_input, np.array(t, dtype=np.float32), text_embeddings])[self._unet_output] - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1] - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs)["prev_sample"].numpy() - # scale and decode the image latents with vae - image = self.vae_decoder(latents * (1 / 0.18215))[self._vae_d_output] - - image = self.postprocess_image(image, meta, output_type) - return {"sample": image} - - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int = 1, - do_classifier_free_guidance: bool = True, - negative_prompt: Union[str, List[str]] = None, - ): - """ - Encodes the prompt into text encoder hidden states. - - Parameters: - prompt (str or list(str)): prompt to be encoded - num_images_per_prompt (int): number of images that should be generated per prompt - do_classifier_free_guidance (bool): whether to use classifier free guidance or not - negative_prompt (str or list(str)): negative prompt to be encoded - Returns: - text_embeddings (np.ndarray): text encoder hidden states - """ - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - # tokenize input prompts - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - - text_embeddings = self.text_encoder(text_input_ids)[self._text_encoder_output] - - # duplicate text embeddings for each generation per prompt - if num_images_per_prompt != 1: - bs_embed, seq_len, _ = text_embeddings.shape - text_embeddings = np.tile(text_embeddings, (1, num_images_per_prompt, 1)) - text_embeddings = np.reshape(text_embeddings, (bs_embed * num_images_per_prompt, seq_len, -1)) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_tokens: List[str] - max_length = text_input_ids.shape[-1] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - else: - uncond_tokens = negative_prompt - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - - uncond_embeddings = self.text_encoder(uncond_input.input_ids)[self._text_encoder_output] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = uncond_embeddings.shape[1] - uncond_embeddings = np.tile(uncond_embeddings, (1, num_images_per_prompt, 1)) - uncond_embeddings = np.reshape(uncond_embeddings, (batch_size * num_images_per_prompt, seq_len, -1)) - - # For 
classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings = np.concatenate([uncond_embeddings, text_embeddings]) - - return text_embeddings - - def prepare_latents(self, image: PIL.Image.Image = None, latent_timestep: torch.Tensor = None): - """ - Function for getting initial latents for starting generation - - Parameters: - image (PIL.Image.Image, *optional*, None): - Input image for generation, if not provided randon noise will be used as starting point - latent_timestep (torch.Tensor, *optional*, None): - Predicted by scheduler initial step for image generation, required for latent image mixing with nosie - Returns: - latents (np.ndarray): - Image encoded in latent space - """ - latents_shape = (1, 4, self.height // 8, self.width // 8) - noise = np.random.randn(*latents_shape).astype(np.float32) - if image is None: - # if we use LMSDiscreteScheduler, let's make sure latents are mulitplied by sigmas - if isinstance(self.scheduler, LMSDiscreteScheduler): - noise = noise * self.scheduler.sigmas[0].numpy() - return noise, {} - input_image, meta = preprocess(image) - latents = self.vae_encoder(input_image)[self._vae_e_output] - latents = latents * 0.18215 - latents = self.scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy() - return latents, meta - - def postprocess_image(self, image: np.ndarray, meta: Dict, output_type: str = "pil"): - """ - Postprocessing for decoded image. Takes generated image decoded by VAE decoder, unpad it to initila image size (if required), - normalize and convert to [0, 255] pixels range. Optionally, convertes it from np.ndarray to PIL.Image format - - Parameters: - image (np.ndarray): - Generated image - meta (Dict): - Metadata obtained on latents preparing step, can be empty - output_type (str, *optional*, pil): - Output format for result, can be pil or numpy - Returns: - image (List of np.ndarray or PIL.Image.Image): - Postprocessed images - """ - if "padding" in meta: - pad = meta["padding"] - (_, end_h), (_, end_w) = pad[1:3] - h, w = image.shape[2:] - unpad_h = h - end_h - unpad_w = w - end_w - image = image[:, :, :unpad_h, :unpad_w] - image = np.clip(image / 2 + 0.5, 0, 1) - image = np.transpose(image, (0, 2, 3, 1)) - # 9. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - if "src_height" in meta: - orig_height, orig_width = meta["src_height"], meta["src_width"] - image = [img.resize((orig_width, orig_height), PIL.Image.Resampling.LANCZOS) for img in image] - else: - if "src_height" in meta: - orig_height, orig_width = meta["src_height"], meta["src_width"] - image = [cv2.resize(img, (orig_width, orig_width)) for img in image] - return image - - def get_timesteps(self, num_inference_steps: int, strength: float): - """ - Helper function for getting scheduler timesteps for generation - In case of image-to-image generation, it updates number of steps according to strength - - Parameters: - num_inference_steps (int): - number of inference steps for generation - strength (float): - value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. - Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input. 
- """ - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] - - return timesteps, num_inference_steps - t_start - -Configure Inference Pipeline -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -First, you should create instances of OpenVINO Model. - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import device_widget - - device = device_widget() - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("stable-diffusion-v2-text-to-image.ipynb") - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=4, options=('CPU', 'GPU.0', 'GPU.1', 'GPU.2', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - ov_config = {"INFERENCE_PRECISION_HINT": "f32"} if device.value != "CPU" else {} - - - core = ov.Core() - text_enc = core.compile_model(TEXT_ENCODER_OV_PATH, device.value) - unet_model = core.compile_model(UNET_OV_PATH, device.value) - vae_decoder = core.compile_model(VAE_DECODER_OV_PATH, device.value, ov_config) - vae_encoder = core.compile_model(VAE_ENCODER_OV_PATH, device.value, ov_config) - -Model tokenizer and scheduler are also important parts of the pipeline. -Let us define them and put all components together. - -.. code:: ipython3 - - from transformers import CLIPTokenizer - - scheduler = DDIMScheduler.from_config(conf) # DDIMScheduler is used because UNet quantization produces better results with it - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - - ov_pipe = OVStableDiffusionPipeline( - tokenizer=tokenizer, - text_encoder=text_enc, - unet=unet_model, - vae_encoder=vae_encoder, - vae_decoder=vae_decoder, - scheduler=scheduler, - ) - -Quantization ------------- - - - -`NNCF `__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -According to ``Stable Diffusion v2`` structure, the UNet model takes up -significant portion of the overall pipeline execution time. Now we will -show you how to optimize the UNet part using -`NNCF `__ to reduce -computation cost and speed up the pipeline. Quantizing the rest of the -pipeline does not significantly improve inference performance but can -lead to a substantial degradation of accuracy. - -For this model we apply quantization in hybrid mode which means that we -quantize: (1) weights of MatMul and Embedding layers and (2) activations -of other layers. The steps are the following: - -1. Create a calibration dataset for quantization. -2. Collect operations with weights. -3. Run ``nncf.compress_model()`` to compress only the model weights. -4. Run ``nncf.quantize()`` on the compressed model with weighted - operations ignored by providing ``ignored_scope`` parameter. -5. Save the ``INT8`` model using ``openvino.save_model()`` function. 
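Condensed, the steps above map onto a handful of NNCF calls. The sketch below assumes
the calibration data (``unet_calibration_data``), the list of weighted operations
(``unet_ignored_scope``), and the UNet IR paths are prepared as in the cells that
follow; note that the weight-compression step is performed with
``nncf.compress_weights()``. The complete implementation, including the helper
functions, is shown below.

.. code:: ipython3

    import nncf
    import openvino as ov

    core = ov.Core()

    # FP16 UNet IR produced during model conversion
    unet = core.read_model(UNET_OV_PATH)

    # Steps 1-2: `unet_calibration_data` and `unet_ignored_scope` are collected
    # by the helper code in the cells below.

    # Step 3: compress weights of MatMul and Embedding operations,
    # leaving Convolutions for full quantization
    compressed_unet = nncf.compress_weights(unet, ignored_scope=nncf.IgnoredScope(types=["Convolution"]))

    # Step 4: quantize activations, skipping the already compressed weighted operations
    quantized_unet = nncf.quantize(
        model=compressed_unet,
        calibration_dataset=nncf.Dataset(unet_calibration_data),
        model_type=nncf.ModelType.TRANSFORMER,
        ignored_scope=nncf.IgnoredScope(names=unet_ignored_scope),
        advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=-1),
    )

    # Step 5: save the INT8 model
    ov.save_model(quantized_unet, UNET_INT8_OV_PATH)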
- -Please select below whether you would like to run quantization to -improve model inference speed. - - **NOTE**: Quantization is time and memory consuming operation. - Running quantization code below may take some time. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - import requests - from pathlib import Path - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - int8_ov_pipe = None - - %load_ext skip_kernel_extension - -Prepare calibration dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use a portion of -`conceptual_captions `__ -dataset from Hugging Face as calibration data. To collect intermediate -model inputs for calibration we should customize ``CompiledModel``. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import datasets - import numpy as np - from tqdm.notebook import tqdm - from typing import Any, Dict, List - - - def disable_progress_bar(pipeline, disable=True): - if not hasattr(pipeline, "_progress_bar_config"): - pipeline._progress_bar_config = {'disable': disable} - else: - pipeline._progress_bar_config['disable'] = disable - - - class CompiledModelDecorator(ov.CompiledModel): - def __init__(self, compiled_model: ov.CompiledModel, data_cache: List[Any] = None, keep_prob: float = 0.5): - super().__init__(compiled_model) - self.data_cache = data_cache if data_cache is not None else [] - self.keep_prob = keep_prob - - def __call__(self, *args, **kwargs): - if np.random.rand() <= self.keep_prob: - self.data_cache.append(*args) - return super().__call__(*args, **kwargs) - - - def collect_calibration_data(ov_pipe, calibration_dataset_size: int, num_inference_steps: int) -> List[Dict]: - original_unet = ov_pipe.unet - calibration_data = [] - ov_pipe.unet = CompiledModelDecorator(original_unet, calibration_data, keep_prob=0.7) - disable_progress_bar(ov_pipe) - - dataset = datasets.load_dataset("google-research-datasets/conceptual_captions", split="train", trust_remote_code=True).shuffle(seed=42) - - # Run inference for data collection - pbar = tqdm(total=calibration_dataset_size) - for batch in dataset: - prompt = batch["caption"] - if len(prompt) > ov_pipe.tokenizer.model_max_length: - continue - ov_pipe(prompt, num_inference_steps=num_inference_steps, seed=1) - pbar.update(len(calibration_data) - pbar.n) - if pbar.n >= calibration_dataset_size: - break - - disable_progress_bar(ov_pipe, disable=False) - ov_pipe.unet = original_unet - return calibration_data - -Run Hybrid Model Quantization -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - from collections import deque - from transformers import set_seed - import nncf - - def get_operation_const_op(operation, const_port_id: int): - node = operation.input_value(const_port_id).get_node() - queue = deque([node]) - constant_node = None - allowed_propagation_types_list = ["Convert", "FakeQuantize", "Reshape"] - - while len(queue) != 0: - curr_node = queue.popleft() - if curr_node.get_type_name() == "Constant": - constant_node = curr_node - break - if len(curr_node.inputs()) == 0: - break - if curr_node.get_type_name() in allowed_propagation_types_list: - queue.append(curr_node.input_value(0).get_node()) - - return constant_node - - - def is_embedding(node) -> bool: - allowed_types_list = ["f16", "f32", "f64"] - const_port_id = 0 - input_tensor = node.input_value(const_port_id) - if input_tensor.get_element_type().get_type_name() in allowed_types_list: - const_node = get_operation_const_op(node, const_port_id) - if const_node is not None: - return True - - return False - - - def collect_ops_with_weights(model): - ops_with_weights = [] - for op in model.get_ops(): - if op.get_type_name() == "MatMul": - constant_node_0 = get_operation_const_op(op, const_port_id=0) - constant_node_1 = get_operation_const_op(op, const_port_id=1) - if constant_node_0 or constant_node_1: - ops_with_weights.append(op.get_friendly_name()) - if op.get_type_name() == "Gather" and is_embedding(op): - ops_with_weights.append(op.get_friendly_name()) - - return ops_with_weights - - UNET_INT8_OV_PATH = sd2_1_model_dir / 'unet_optimized.xml' - if not UNET_INT8_OV_PATH.exists(): - calibration_dataset_size = 300 - set_seed(1) - unet_calibration_data = collect_calibration_data(ov_pipe, - calibration_dataset_size=calibration_dataset_size, - num_inference_steps=50) - - unet = core.read_model(UNET_OV_PATH) - - # Collect operations which weights will be compressed - unet_ignored_scope = collect_ops_with_weights(unet) - - # Compress model weights - compressed_unet = nncf.compress_weights(unet, ignored_scope=nncf.IgnoredScope(types=['Convolution'])) - - # Quantize both weights and activations of Convolution layers - quantized_unet = nncf.quantize( - model=compressed_unet, - calibration_dataset=nncf.Dataset(unet_calibration_data), - subset_size=calibration_dataset_size, - model_type=nncf.ModelType.TRANSFORMER, - ignored_scope=nncf.IgnoredScope(names=unet_ignored_scope), - advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=-1) - ) - - ov.save_model(quantized_unet, UNET_INT8_OV_PATH) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - int8_unet_model = core.compile_model(UNET_INT8_OV_PATH, device.value) - int8_ov_pipe = OVStableDiffusionPipeline( - tokenizer=tokenizer, - text_encoder=text_enc, - unet=int8_unet_model, - vae_encoder=vae_encoder, - vae_decoder=vae_decoder, - scheduler=scheduler - ) - -Compare UNet file size -~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: ipython3 - - %%skip not $to_quantize.value - - fp16_ir_model_size = UNET_OV_PATH.with_suffix(".bin").stat().st_size / 1024 - quantized_model_size = UNET_INT8_OV_PATH.with_suffix(".bin").stat().st_size / 1024 - - print(f"FP16 model size: {fp16_ir_model_size:.2f} KB") - print(f"INT8 model size: {quantized_model_size:.2f} KB") - print(f"Model compression rate: {fp16_ir_model_size / quantized_model_size:.3f}") - - -.. 
parsed-literal:: - - FP16 model size: 1691232.51 KB - INT8 model size: 846918.58 KB - Model compression rate: 1.997 - - -Compare inference time of the FP16 and INT8 pipelines -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To measure the inference performance of the ``FP16`` and ``INT8`` -pipelines, we use median inference time on calibration subset. - - **NOTE**: For the most accurate performance estimation, it is - recommended to run ``benchmark_app`` in a terminal/command prompt - after closing other applications. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import time - - def calculate_inference_time(pipeline, validation_data): - inference_time = [] - pipeline.set_progress_bar_config(disable=True) - for prompt in validation_data: - start = time.perf_counter() - _ = pipeline(prompt, num_inference_steps=10, seed=0) - end = time.perf_counter() - delta = end - start - inference_time.append(delta) - return np.median(inference_time) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - validation_size = 10 - validation_dataset = datasets.load_dataset("google-research-datasets/conceptual_captions", split="train", streaming=True, trust_remote_code=True).take(validation_size) - validation_data = [batch["caption"] for batch in validation_dataset] - - fp_latency = calculate_inference_time(ov_pipe, validation_data) - int8_latency = calculate_inference_time(int8_ov_pipe, validation_data) - print(f"Performance speed-up: {fp_latency / int8_latency:.3f}") - - -.. parsed-literal:: - - /home/nsavel/venvs/ov_notebooks_tmp/lib/python3.8/site-packages/datasets/load.py:1429: FutureWarning: The repository for conceptual_captions contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conceptual_captions - You can avoid this message in future by passing the argument `trust_remote_code=True`. - Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`. - warnings.warn( - - -.. parsed-literal:: - - Performance speed-up: 1.232 - - -Run Text-to-Image generation ----------------------------- - - - -Now, you can define a text prompts for image generation and run -inference pipeline. Optionally, you can also change the random generator -seed for latent state initialization and number of steps. - - **Note**: Consider increasing ``steps`` to get more precise results. - A suggested value is ``50``, but it will take longer time to process. - -Please select below whether you would like to use the quantized model to -launch the interactive demo. - -.. code:: ipython3 - - import ipywidgets as widgets - - quantized_model_present = int8_ov_pipe is not None - - use_quantized_model = widgets.Checkbox( - value=quantized_model_present, - description="Use quantized model", - disabled=not quantized_model_present, - ) - - use_quantized_model - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Use quantized model') - - - -.. 
code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-diffusion-v2/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - pipeline = int8_ov_pipe if use_quantized_model.value else ov_pipe - - demo = make_demo(pipeline) - - try: - demo.queue().launch() - except Exception: - demo.queue().launch(share=True) diff --git a/docs/notebooks/stable-diffusion-v3-torch-fx-with-output.rst b/docs/notebooks/stable-diffusion-v3-torch-fx-with-output.rst deleted file mode 100644 index 4abeae0511cb8f..00000000000000 --- a/docs/notebooks/stable-diffusion-v3-torch-fx-with-output.rst +++ /dev/null @@ -1,561 +0,0 @@ -Image generation with Torch.FX Stable Diffusion v3 and OpenVINO -=============================================================== - -Stable Diffusion V3 is next generation of latent diffusion image Stable -Diffusion models family that outperforms state-of-the-art text-to-image -generation systems in typography and prompt adherence, based on human -preference evaluations. In comparison with previous versions, it based -on Multimodal Diffusion Transformer (MMDiT) text-to-image model that -features greatly improved performance in image quality, typography, -complex prompt understanding, and resource-efficiency. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/dd079427-89f2-4d28-a10e-c80792d750bf - :alt: mmdit.png - - mmdit.png - -More details about model can be found in `model -card `__, -`research -paper `__ -and `Stability.AI blog -post `__. In this -tutorial, we will demonstrate the optimize stable diffusion 3 in a Torch -FX representation using NNCF -`NNCF `__ for model -optimization. Additionally, we will accelerate the pipeline further by -running with torch.compile using the openvino backend. If you want to -run previous Stable Diffusion versions, please check our other -notebooks: - -- `Stable Diffusion `__ -- `Stable Diffusion v2 `__ -- `Stable Diffusion v3 `__ -- `Stable Diffusion XL `__ -- `LCM Stable - Diffusion `__ -- `Turbo SDXL `__ -- `Turbo SD `__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Build PyTorch pipeline <#build-pytorch-pipeline>`__ - - - `Store the Configs <#store-the-configs>`__ - -- `Run FP Inference <#run-fp-inference>`__ -- `Convert models to Torch FX <#convert-models-to-torch-fx>`__ -- `Quantization <#quantization>`__ - - - `Collect Calibration Dataset <#collect-calibration-dataset>`__ - - `Compress and Quantize models <#compress-and-quantize-models>`__ - - `Create Optimized Pipeline <#create-optimized-pipeline>`__ - - `Check File Size <#check-file-size>`__ - - `Optimized pipeline inference <#optimized-pipeline-inference>`__ - - `Visualize Results <#visualize-results>`__ - -- `Interactive demo <#interactive-demo>`__ - -Prerequisites -------------- - - - -.. 
code:: ipython3 - - %pip install -q "gradio>=4.19" "torch>=2.5" "torchvision>=0.20" "numpy<2.0" "transformers" "datasets>=2.14.6" "opencv-python" "pillow" "peft>=0.7.0" "diffusers>=0.31.0" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -qU "openvino>=2024.3.0" - %pip install -q "nncf>=2.14.0" "typing_extensions>=4.11" - -.. code:: ipython3 - - from pathlib import Path - - import requests - - if not Path("sd3_torch_fx_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-diffusion-v3/sd3_torch_fx_helper.py") - open("sd3_torch_fx_helper.py", "w").write(r.text) - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-diffusion-v3/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("stable-diffusion-v3-torch-fx.ipynb") - -Build PyTorch pipeline ----------------------- - - - - **Note**: run model with notebook, you will need to accept license - agreement. You must be a registered user in Hugging Face Hub. - Please visit `HuggingFace model - card `__, - carefully read terms of usage and click accept button. You will need - to use an access token for the code below to run. For more - information on access tokens, refer to `this section of the - documentation `__. - You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: ipython3 - - # uncomment these lines to login to huggingfacehub to get access to pretrained model - - # from huggingface_hub import notebook_login, whoami - - # try: - # whoami() - # print('Authorization token already provided') - # except OSError: - # notebook_login() - -.. code:: ipython3 - - from sd3_torch_fx_helper import get_sd3_pipeline, init_pipeline - - pipe = get_sd3_pipeline() - pipe.to("cpu") - -Store the Configs -~~~~~~~~~~~~~~~~~ - - - -This will be used later when wrapping the Torch FX models to insert back -into the pipeline - -.. code:: ipython3 - - configs_dict = {} - configs_dict["text_encoder"] = pipe.text_encoder.config - configs_dict["text_encoder_2"] = pipe.text_encoder_2.config - configs_dict["transformer"] = pipe.transformer.config - configs_dict["vae"] = pipe.vae.config - - pipe_config = pipe.config - -Run FP Inference ----------------- - - - -.. code:: ipython3 - - import numpy as np - import torch - - generator = torch.Generator(device="cpu").manual_seed(42) - prompt = "A raccoon trapped inside a glass jar full of colorful candies, the background is steamy with vivid colors" - num_inference_steps = 28 - with torch.no_grad(): - image = pipe( - prompt=prompt, negative_prompt="", num_inference_steps=num_inference_steps, generator=generator, guidance_scale=5, height=512, width=512 - ).images[0] - image - -.. 
code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - -Convert models to Torch FX --------------------------- - - - -This step converts the pytorch models in the hf pipeline to Torch FX -representation using the ``capture_pre_autograd()`` function. - -The pipeline consists of four important parts: - -- Clip and T5 Text Encoders to create condition to generate an image - from a text prompt. -- Transformer for step-by-step denoising latent image representation. -- Autoencoder (VAE) for decoding latent space to image. - -.. code:: ipython3 - - import torch - from nncf.torch.dynamic_graph.patch_pytorch import disable_patching - - text_encoder_input = torch.ones((1, 77), dtype=torch.long) - text_encoder_kwargs = {} - text_encoder_kwargs["output_hidden_states"] = True - - vae_encoder_input = torch.ones((1, 3, 64, 64)) - vae_decoder_input = torch.ones((1, 16, 64, 64)) - - unet_kwargs = {} - unet_kwargs["hidden_states"] = torch.ones((2, 16, 64, 64)) - unet_kwargs["timestep"] = torch.from_numpy(np.array([1, 2], dtype=np.float32)) - unet_kwargs["encoder_hidden_states"] = torch.ones((2, 154, 4096)) - unet_kwargs["pooled_projections"] = torch.ones((2, 2048)) - - with torch.no_grad(): - with disable_patching(): - text_encoder = torch.export.export_for_training( - pipe.text_encoder.eval(), - args=(text_encoder_input,), - kwargs=(text_encoder_kwargs), - ).module() - text_encoder_2 = torch.export.export_for_training( - pipe.text_encoder_2.eval(), - args=(text_encoder_input,), - kwargs=(text_encoder_kwargs), - ).module() - pipe.vae.decoder = torch.export.export_for_training(pipe.vae.decoder.eval(), args=(vae_decoder_input,)).module() - pipe.vae.encoder = torch.export.export_for_training(pipe.vae.encoder.eval(), args=(vae_encoder_input,)).module() - vae = pipe.vae - transformer = torch.export.export_for_training(pipe.transformer.eval(), args=(), kwargs=(unet_kwargs)).module() - models_dict = {} - models_dict["transformer"] = transformer - models_dict["vae"] = vae - models_dict["text_encoder"] = text_encoder - models_dict["text_encoder_2"] = text_encoder_2 - del unet_kwargs - del vae_encoder_input - del vae_decoder_input - del text_encoder_input - del text_encoder_kwargs - del pipe - -Quantization ------------- - - - -`NNCF `__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -According to ``StableDiffusion3Pipeline`` structure, the ``transformer`` -model takes up significant portion of the overall pipeline execution -time. Now we will show you how to optimize the transformer part using -`NNCF `__ to reduce -computation cost and speed up the pipeline. Quantizing the rest of the -pipeline does not significantly improve inference performance but can -lead to a substantial degradation of accuracy. That’s why we use 8-bit -weight compression for the rest of the pipeline to reduce memory -footprint. - -Please select below whether you would like to run quantization to -improve model inference speed. - - **NOTE**: Quantization is time and memory consuming operation. - Running quantization code below may take some time. - -.. 
code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget() - - to_quantize - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - - %load_ext skip_kernel_extension - -Collect Calibration Dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from typing import Any, Dict, List - - import datasets - from diffusers.models.transformers.transformer_sd3 import SD3Transformer2DModel - from tqdm.notebook import tqdm - - - def disable_progress_bar(pipeline, disable=True): - if not hasattr(pipeline, "_progress_bar_config"): - pipeline._progress_bar_config = {"disable": disable} - else: - pipeline._progress_bar_config["disable"] = disable - - - class UNetWrapper(SD3Transformer2DModel): - def __init__(self, transformer, config): - super().__init__(**config) - self.transformer = transformer - self.captured_args = [] - - def forward(self, *args, **kwargs): - del kwargs["joint_attention_kwargs"] - del kwargs["return_dict"] - self.captured_args.append((*args, *tuple(kwargs.values()))) - return self.transformer(*args, **kwargs) - - - def collect_calibration_data( - pipe, calibration_dataset_size: int, num_inference_steps: int - ) -> List[Dict]: - - original_unet = pipe.transformer - calibration_data = [] - disable_progress_bar(pipe) - - dataset = datasets.load_dataset( - "google-research-datasets/conceptual_captions", - split="train", - trust_remote_code=True, - ).shuffle(seed=42) - - transformer_config = dict(pipe.transformer.config) - if "model" in transformer_config: - del transformer_config["model"] - wrapped_unet = UNetWrapper(pipe.transformer.model, transformer_config) - pipe.transformer = wrapped_unet - # Run inference for data collection - pbar = tqdm(total=calibration_dataset_size) - for i, batch in enumerate(dataset): - prompt = batch["caption"] - if len(prompt) > pipe.tokenizer.model_max_length: - continue - # Run the pipeline - pipe(prompt, num_inference_steps=num_inference_steps, height=512, width=512) - calibration_data.extend(wrapped_unet.captured_args) - wrapped_unet.captured_args = [] - pbar.update(len(calibration_data) - pbar.n) - if pbar.n >= calibration_dataset_size: - break - - disable_progress_bar(pipe, disable=False) - pipe.transformer = original_unet - return calibration_data - - - if to_quantize: - pipe = init_pipeline(models_dict, configs_dict) - calibration_dataset_size = 200 - unet_calibration_data = collect_calibration_data( - pipe, calibration_dataset_size=calibration_dataset_size, num_inference_steps=28 - ) - del pipe - -Compress and Quantize models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters - from nncf.quantization.range_estimator import RangeEstimatorParametersSet - - text_encoder = models_dict["text_encoder"] - text_encoder_2 = models_dict["text_encoder_2"] - vae_encoder = models_dict["vae"].encoder - vae_decoder = models_dict["vae"].decoder - original_transformer = models_dict["transformer"] - if to_quantize: - with disable_patching(): - with torch.no_grad(): - nncf.compress_weights(text_encoder) - nncf.compress_weights(text_encoder_2) - nncf.compress_weights(vae_encoder) - nncf.compress_weights(vae_decoder) - quantized_transformer = nncf.quantize( - model=original_transformer, - calibration_dataset=nncf.Dataset(unet_calibration_data), - subset_size=len(unet_calibration_data), - model_type=nncf.ModelType.TRANSFORMER, - ignored_scope=nncf.IgnoredScope(names=["conv2d"]), - advanced_parameters=nncf.AdvancedQuantizationParameters( - weights_range_estimator_params=RangeEstimatorParametersSet.MINMAX, - activations_range_estimator_params=RangeEstimatorParametersSet.MINMAX, - ), - ) - - optimized_models_dict = {} - optimized_models_dict["transformer"] = quantized_transformer - optimized_models_dict["vae"] = vae - optimized_models_dict["text_encoder"] = text_encoder - optimized_models_dict["text_encoder_2"] = text_encoder_2 - del models_dict - -.. code:: ipython3 - - %%skip not $to_quantize.value - import openvino.torch - - optimized_models_dict["text_encoder"] = torch.compile( - optimized_models_dict["text_encoder"], backend="openvino" - ) - optimized_models_dict["text_encoder_2"] = torch.compile( - optimized_models_dict["text_encoder_2"], backend="openvino" - ) - optimized_models_dict["vae"].encoder = torch.compile( - optimized_models_dict["vae"].encoder, backend="openvino" - ) - optimized_models_dict["vae"].decoder = torch.compile( - optimized_models_dict["vae"].decoder, backend="openvino" - ) - optimized_models_dict["transformer"] = torch.compile( - optimized_models_dict["transformer"], backend="openvino" - ) - -Create Optimized Pipeline -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Initialize the optimized pipeline using the optimized models - -.. code:: ipython3 - - %%skip not $to_quantize.value - - opt_pipe = init_pipeline(optimized_models_dict, configs_dict) - -Check File Size -~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - - def get_model_size(models): - total_size = 0 - for model in models: - param_size = 0 - for param in model.parameters(): - param_size += param.nelement() * param.element_size() - buffer_size = 0 - for buffer in model.buffers(): - buffer_size += buffer.nelement() * buffer.element_size() - - model_size_mb = (param_size + buffer_size) / 1024**2 - - total_size += model_size_mb - return total_size - - - optimized_model_size = get_model_size([opt_pipe.transformer]) - original_model_size = get_model_size([original_transformer]) - - print(f"Original Transformer Size: {original_model_size} MB") - print(f"Optimized Transformer Size: {optimized_model_size} MB") - print(f"Compression Rate: {original_model_size / optimized_model_size:.3f}") - -Optimized pipeline inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Run inference with single step to compile the model. - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - # Warmup the model for initial compile - with torch.no_grad(): - opt_pipe( - prompt=prompt, negative_prompt="", num_inference_steps=1, generator=generator, height=512, width=512 - ).images[0] - -Visualize Results -~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from sd3_torch_fx_helper import visualize_results - - generator = torch.Generator(device="cpu").manual_seed(42) - opt_image = opt_pipe( - prompt, - negative_prompt="", - num_inference_steps=28, - guidance_scale=5, - generator=generator, - height=512, - width=512 - ).images[0] - - visualize_results(image, opt_image) - -Interactive demo ----------------- - - - -Please select below whether you would like to use the quantized models -to launch the interactive demo. - -.. code:: ipython3 - - use_quantized_models = quantization_widget() - - use_quantized_models - -.. code:: ipython3 - - from gradio_helper import make_demo - - fx_pipe = init_pipeline(models_dict if not to_quantize.value else optimized_models_dict, configs_dict) - demo = make_demo(fx_pipe, False) - - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # if you have any issue to launch on your platform, you can pass share=True to launch method: - # demo.launch(share=True) - # it creates a publicly shareable link for the interface. Read more in the docs: https://gradio.app/docs/ - try: - demo.launch(debug=True) - except Exception: - demo.launch(debug=True, share=True) diff --git a/docs/notebooks/stable-diffusion-v3-with-output.rst b/docs/notebooks/stable-diffusion-v3-with-output.rst deleted file mode 100644 index 02d2412cc0b0bf..00000000000000 --- a/docs/notebooks/stable-diffusion-v3-with-output.rst +++ /dev/null @@ -1,646 +0,0 @@ -Image generation with Stable Diffusion v3 and OpenVINO -====================================================== - -Stable Diffusion V3 is next generation of latent diffusion image Stable -Diffusion models family that outperforms state-of-the-art text-to-image -generation systems in typography and prompt adherence, based on human -preference evaluations. In comparison with previous versions, it based -on Multimodal Diffusion Transformer (MMDiT) text-to-image model that -features greatly improved performance in image quality, typography, -complex prompt understanding, and resource-efficiency. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/dd079427-89f2-4d28-a10e-c80792d750bf - :alt: mmdit.png - - mmdit.png - -More details about model can be found in `model -card `__, -`research -paper `__ -and `Stability.AI blog -post `__. In this -tutorial, we will consider how to convert Stable Diffusion v3 for -running with OpenVINO. An additional part demonstrates how to run -optimization with `NNCF `__ to -speed up pipeline. 
If you want to run previous Stable Diffusion -versions, please check our other notebooks: - -- `Stable Diffusion `__ -- `Stable Diffusion v2 `__ -- `Stable Diffusion XL `__ -- `LCM Stable - Diffusion `__ -- `Turbo SDXL `__ -- `Turbo SD `__ - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Build PyTorch pipeline <#build-pytorch-pipeline>`__ -- `Convert models with OpenVINO <#convert-models-with-openvino>`__ - - - `Transformer <#transformer>`__ - - `T5 Text Encoder <#t5-text-encoder>`__ - - `Clip text encoders <#clip-text-encoders>`__ - - `VAE <#vae>`__ - -- `Prepare OpenVINO inference - pipeline <#prepare-openvino-inference-pipeline>`__ -- `Run OpenVINO model <#run-openvino-model>`__ -- `Quantization <#quantization>`__ - - - `Prepare calibration dataset <#prepare-calibration-dataset>`__ - - `Run Quantization <#run-quantization>`__ - - `Run Weights Compression <#run-weights-compression>`__ - - `Compare model file sizes <#compare-model-file-sizes>`__ - - `Compare inference time of the FP16 and optimized - pipelines <#compare-inference-time-of-the-fp16-and-optimized-pipelines>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "git+https://github.com/initml/diffusers.git@clement/feature/flash_sd3" "gradio>=4.19" "torch>=2.1" "transformers" "nncf>=2.12.0" "datasets>=2.14.6" "opencv-python" "pillow" "peft>=0.7.0" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -qU "openvino>=2024.3.0" - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("sd3_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-diffusion-v3/sd3_helper.py") - open("sd3_helper.py", "w").write(r.text) - - if not Path("sd3_quantization_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-diffusion-v3/sd3_quantization_helper.py") - open("sd3_quantization_helper.py", "w").write(r.text) - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-diffusion-v3/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - if not Path("notebook_utils.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") - open("notebook_utils.py", "w").write(r.text) - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("stable-diffusion-v3.ipynb") - -Build PyTorch pipeline ----------------------- - - - - **Note**: run model with notebook, you will need to accept license - agreement. You must be a registered user in Hugging Face Hub. 
- Please visit `HuggingFace model - card `__, - carefully read terms of usage and click accept button. You will need - to use an access token for the code below to run. For more - information on access tokens, refer to `this section of the - documentation `__. - You can login on Hugging Face Hub in notebook environment, using - following code: - -.. code:: ipython3 - - # uncomment these lines to login to huggingfacehub to get access to pretrained model - - # from huggingface_hub import notebook_login, whoami - - # try: - # whoami() - # print('Authorization token already provided') - # except OSError: - # notebook_login() - -We will use -`Diffusers `__ -library integration for running Stable Diffusion v3 model. You can find -more details in Diffusers -`documentation `__. -Additionally, we can apply optimization for pipeline performance and -memory consumption: - -- **Use flash SD3**. Flash Diffusion is a diffusion distillation method - proposed in `Flash Diffusion: Accelerating Any Conditional Diffusion - Model for Few Steps Image - Generation `__. The model - represented as a 90.4M LoRA distilled version of SD3 model that is - able to generate 1024x1024 images in 4 steps. If you want disable it, - you can unset checkbox **Use flash SD3** -- **Remove T5 text encoder**. Removing the memory-intensive 4.7B - parameter T5-XXL text encoder during inference can significantly - decrease the memory requirements for SD3 with only a slight loss in - performance. If you want to use this model in pipeline, please set - **use t5 text encoder** checkbox. - -.. code:: ipython3 - - from sd3_helper import get_pipeline_options - - pt_pipeline_options, use_flash_lora, load_t5 = get_pipeline_options() - - display(pt_pipeline_options) - - -.. parsed-literal:: - - /home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/diffusers/models/transformers/transformer_2d.py:34: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 1.0.0. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.modeling_outputs import Transformer2DModelOutput`, instead. - deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message) - 2024-08-08 08:15:46.648328: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-08-08 08:15:46.650527: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used. - 2024-08-08 08:15:46.687530: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-08-08 08:15:47.368728: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - - -.. parsed-literal:: - - VBox(children=(Checkbox(value=True, description='Use flash SD3'), Checkbox(value=False, description='Use t5 te… - - -Convert models with OpenVINO ----------------------------- - - - -Starting from 2023.0 release, OpenVINO supports PyTorch models directly -via Model Conversion API. 
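As a minimal, hedged illustration of this API (using a small stand-in
``torch.nn.Sequential`` model rather than the SD3 submodels that the
``convert_sd3`` helper below takes care of):

.. code:: ipython3

    import torch
    import openvino as ov

    # Toy stand-in model; this notebook converts the SD3 pipeline parts instead.
    pt_model = torch.nn.Sequential(torch.nn.Linear(8, 4), torch.nn.ReLU()).eval()

    # Trace the PyTorch module with an example input and obtain an ov.Model.
    example_input = torch.randn(1, 8)
    ov_model = ov.convert_model(pt_model, example_input=example_input)

    # The converted model can be saved as OpenVINO IR on disk ...
    ov.save_model(ov_model, "toy_model.xml")

    # ... or compiled and run directly.
    compiled_model = ov.Core().compile_model(ov_model, "CPU")
    result = compiled_model(example_input.numpy())[0]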
``ov.convert_model`` function accepts instance -of PyTorch model and example inputs for tracing and returns object of -``ov.Model`` class, ready to use or save on disk using ``ov.save_model`` -function. - -The pipeline consists of four important parts: - -- Clip and T5 Text Encoders to create condition to generate an image - from a text prompt. -- Transformer for step-by-step denoising latent image representation. -- Autoencoder (VAE) for decoding latent space to image. - -We will use ``convert_sd3`` helper function defined in -`sd3_helper.py `__ that create original PyTorch model -and convert each part of pipeline using ``ov.convert_model``. - -.. code:: ipython3 - - from sd3_helper import convert_sd3 - - # Uncomment the line beolow to see model conversion code - # ??convert_sd3 - -.. code:: ipython3 - - convert_sd3(load_t5.value, use_flash_lora.value) - - -.. parsed-literal:: - - SD3 model already converted - - -Prepare OpenVINO inference pipeline ------------------------------------ - - - -.. code:: ipython3 - - from sd3_helper import OVStableDiffusion3Pipeline, init_pipeline # noqa: F401 - - # Uncomment line below to see pipeline code - # ??OVStableDiffusion3Pipeline - -Run OpenVINO model ------------------- - - - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - from sd3_helper import TEXT_ENCODER_PATH, TEXT_ENCODER_2_PATH, TEXT_ENCODER_3_PATH, TRANSFORMER_PATH, VAE_DECODER_PATH - - models_dict = {"transformer": TRANSFORMER_PATH, "vae": VAE_DECODER_PATH, "text_encoder": TEXT_ENCODER_PATH, "text_encoder_2": TEXT_ENCODER_2_PATH} - - if load_t5.value: - models_dict["text_encoder_3"] = TEXT_ENCODER_3_PATH - - ov_pipe = init_pipeline(models_dict, device.value, use_flash_lora.value) - - -.. parsed-literal:: - - Models compilation - transformer - Done! - vae - Done! - text_encoder - Done! - text_encoder_2 - Done! - - -.. code:: ipython3 - - import torch - - image = ov_pipe( - "A raccoon trapped inside a glass jar full of colorful candies, the background is steamy with vivid colors", - negative_prompt="", - num_inference_steps=28 if not use_flash_lora.value else 4, - guidance_scale=5 if not use_flash_lora.value else 0, - height=512, - width=512, - generator=torch.Generator().manual_seed(141), - ).images[0] - image - - - -.. parsed-literal:: - - 0%| | 0/4 [00:00`__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -According to ``OVStableDiffusion3Pipeline`` structure, the -``transformer`` model takes up significant portion of the overall -pipeline execution time. Now we will show you how to optimize the UNet -part using `NNCF `__ to reduce -computation cost and speed up the pipeline. Quantizing the rest of the -pipeline does not significantly improve inference performance but can -lead to a substantial degradation of accuracy. That’s why we use 4-bit -weight compression for the rest of the pipeline to reduce memory -footprint. - -Please select below whether you would like to run quantization to -improve model inference speed. - - **NOTE**: Quantization is time and memory consuming operation. 
- Running quantization code below may take some time. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - from sd3_quantization_helper import TRANSFORMER_INT8_PATH, TEXT_ENCODER_INT4_PATH, TEXT_ENCODER_2_INT4_PATH, TEXT_ENCODER_3_INT4_PATH, VAE_DECODER_INT4_PATH - - to_quantize = quantization_widget() - - to_quantize - - -.. parsed-literal:: - - huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... - To disable this warning, you can either: - - Avoid using `tokenizers` before the fork if possible - - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - optimized_pipe = None - - opt_models_dict = { - "transformer": TRANSFORMER_INT8_PATH, - "text_encoder": TEXT_ENCODER_INT4_PATH, - "text_encoder_2": TEXT_ENCODER_2_INT4_PATH, - "vae": VAE_DECODER_INT4_PATH, - } - - if TEXT_ENCODER_3_PATH.exists(): - opt_models_dict["text_encoder_3"] = TEXT_ENCODER_3_INT4_PATH - - %load_ext skip_kernel_extension - -Prepare calibration dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use a portion of -`google-research-datasets/conceptual_captions `__ -dataset from Hugging Face as calibration data. We use prompts below to -guide image generation and to determine what not to include in the -resulting image. - -To collect intermediate model inputs for calibration we should customize -``CompiledModel``. We should set the height and width of the image to -512 to reduce memory consumption during quantization. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from sd3_quantization_helper import collect_calibration_data, TRANSFORMER_INT8_PATH - - # Uncomment the line to see calibration data collection code - # ??collect_calibration_data - - -Run Quantization -~~~~~~~~~~~~~~~~ - - - -Quantization of the first ``Convolution`` layer impacts the generation -results. We recommend using ``IgnoredScope`` to keep accuracy sensitive -layers in FP16 precision. - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - import gc - import openvino as ov - - core = ov.Core() - - - if not TRANSFORMER_INT8_PATH.exists(): - calibration_dataset_size = 200 - print("Calibration data collection started") - unet_calibration_data = collect_calibration_data(ov_pipe, - calibration_dataset_size=calibration_dataset_size, - num_inference_steps=28 if not use_flash_lora.value else 4, - guidance_scale=5 if not use_flash_lora.value else 0 - ) - print("Calibration data collection finished") - - del ov_pipe - gc.collect() - ov_pipe = None - - transformer = core.read_model(TRANSFORMER_PATH) - quantized_model = nncf.quantize( - model=transformer, - calibration_dataset=nncf.Dataset(unet_calibration_data), - subset_size=calibration_dataset_size, - model_type=nncf.ModelType.TRANSFORMER, - ignored_scope=nncf.IgnoredScope(names=["__module.model.base_model.model.pos_embed.proj.base_layer/aten::_convolution/Convolution"]), - ) - - ov.save_model(quantized_model, TRANSFORMER_INT8_PATH) - -Run Weights Compression -~~~~~~~~~~~~~~~~~~~~~~~ - - - -Quantizing of the ``Text Encoders`` and ``Autoencoder`` does not -significantly improve inference performance but can lead to a -substantial degradation of accuracy. - -For reducing model memory consumption we will use weights compression. -The `Weights -Compression `__ -algorithm is aimed at compressing the weights of the models and can be -used to optimize the model footprint and performance of large models -where the size of weights is relatively larger than the size of -activations, for example, Large Language Models (LLM). Compared to INT8 -compression, INT4 compression improves performance even more, but -introduces a minor drop in prediction quality. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from sd3_quantization_helper import compress_models - - compress_models() - - -.. parsed-literal:: - - Compressed text_encoder can be found in stable-diffusion-3/text_encoder_int4.xml - Compressed text_encoder_2 can be found in stable-diffusion-3/text_encoder_2_int4.xml - Compressed vae_decoder can be found in stable-diffusion-3/vae_decoder_int4.xml - - -Let’s compare the images generated by the original and optimized -pipelines. - -.. code:: ipython3 - - %%skip not $to_quantize.value - optimized_pipe = init_pipeline(opt_models_dict, device.value, use_flash_lora.value) - - -.. parsed-literal:: - - Models compilation - transformer - Done! - text_encoder - Done! - text_encoder_2 - Done! - vae - Done! - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from sd3_quantization_helper import visualize_results - - opt_image = optimized_pipe( - "A raccoon trapped inside a glass jar full of colorful candies, the background is steamy with vivid colors", - negative_prompt="", - num_inference_steps=28 if not use_flash_lora.value else 4, - guidance_scale=5 if not use_flash_lora.value else 0, - height=512, - width=512, - generator=torch.Generator().manual_seed(141), - ).images[0] - - visualize_results(image, opt_image) - - - -.. parsed-literal:: - - 0%| | 0/4 [00:00`__ consists of an `ensemble of -experts `__ pipeline for latent -diffusion: In the first step, the base model is used to generate (noisy) -latents, which are then further processed with a refinement model -specialized for the final denoising steps. Note that the base model can -be used as a standalone module or in a two-stage pipeline as follows: -First, the base model is used to generate latents of the desired output -size. 
In the second step, we use a specialized high-resolution model and -apply a technique called -`SDEdit `__\ ( also known as “image to -image”) to the latents generated in the first step, using the same -prompt. - -Compared to previous versions of Stable Diffusion, SDXL leverages a -three times larger UNet backbone: The increase of model parameters is -mainly due to more attention blocks and a larger cross-attention context -as SDXL uses a second text encoder. The authors design multiple novel -conditioning schemes and train SDXL on multiple aspect ratios and also -introduce a refinement model that is used to improve the visual fidelity -of samples generated by SDXL using a post-hoc image-to-image technique. -The testing of SDXL shows drastically improved performance compared to -the previous versions of Stable Diffusion and achieves results -competitive with those of black-box state-of-the-art image generators. - -In this tutorial, we consider how to run the SDXL model using OpenVINO. - -We will use a pre-trained model from the `Hugging Face -Diffusers `__ library. To -simplify the user experience, the `Hugging Face Optimum -Intel `__ library is -used to convert the models to OpenVINO™ IR format. - -The tutorial consists of the following steps: - -- Install prerequisites -- Download the Stable Diffusion XL Base model from a public source - using the `OpenVINO integration with Hugging Face - Optimum `__. -- Run Text2Image generation pipeline using Stable Diffusion XL base -- Run Image2Image generation pipeline using Stable Diffusion XL base - -.. - - **Note**: Some demonstrated models can require at least 64GB RAM for - conversion and running. - - -**Table of contents:** - - -- `Install prerequisites <#install-prerequisites>`__ -- `SDXL Base model <#sdxl-base-model>`__ - - - `Select inference device SDXL Base - model <#select-inference-device-sdxl-base-model>`__ - - `Run Text2Image generation - pipeline <#run-text2image-generation-pipeline>`__ - - `Text2image Generation Interactive - Demo <#text2image-generation-interactive-demo>`__ - - `Run Image2Image generation - pipeline <#run-image2image-generation-pipeline>`__ - - - `Select inference device SDXL Refiner - model <#select-inference-device-sdxl-refiner-model>`__ - - - `Image2Image Generation Interactive - Demo <#image2image-generation-interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Install prerequisites ---------------------- - - - -.. code:: ipython3 - - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "torch>=2.1" "torchvision" "diffusers>=0.24.0" "invisible-watermark>=0.2.0" "transformers>=4.33.0" "accelerate" "onnx!=1.16.2" "peft>=0.6.2" - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" - %pip install -q "openvino>=2023.1.0" "gradio>=4.19" "nncf>=2.9.0" - -SDXL Base model ---------------- - - - -We will start with the base model part, which is responsible for the -generation of images of the desired output size. -`stable-diffusion-xl-base-1.0 `__ -is available for downloading via the `HuggingFace -hub `__. It already provides a -ready-to-use model in OpenVINO format compatible with `Optimum -Intel `__. 
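As the next paragraph describes, the model can also be exported through the
Python API instead of ``optimum-cli``; a hedged sketch of that on-the-fly
conversion path (the cells below use the CLI with 8-bit weight compression
instead) could look like this:

.. code:: ipython3

    from optimum.intel.openvino import OVStableDiffusionXLPipeline

    # Convert the PyTorch checkpoint to OpenVINO IR on the fly (export=True)
    # and save it so that later runs can load the converted model from disk.
    model_id = "stabilityai/stable-diffusion-xl-base-1.0"
    pipeline = OVStableDiffusionXLPipeline.from_pretrained(model_id, export=True)
    pipeline.save_pretrained("openvino-sd-xl-base-1.0")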
- -To load an OpenVINO model and run an inference with OpenVINO Runtime, -you need to replace diffusers ``StableDiffusionXLPipeline`` with Optimum -``OVStableDiffusionXLPipeline``. In case you want to load a PyTorch -model and convert it to the OpenVINO format on the fly, you can set -``export=True``. - -You can save the model on disk using the ``save_pretrained`` method. - -.. code:: ipython3 - - from pathlib import Path - from optimum.intel.openvino import OVStableDiffusionXLPipeline - import gc - - model_id = "stabilityai/stable-diffusion-xl-base-1.0" - model_dir = Path("openvino-sd-xl-base-1.0") - - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("stable-diffusion-xl.ipynb") - - -.. parsed-literal:: - - 2024-10-17 22:53:35.107765: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-10-17 22:53:35.109501: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used. - 2024-10-17 22:53:35.146015: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-10-17 22:53:35.889441: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - - -Select inference device SDXL Base model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Please select below whether you would like to use weight compression to -reduce memory footprint. `Optimum -Intel `__ -supports weight compression via NNCF out of the box. For 8-bit -compression we provide -``quantization_config=OVWeightQuantizationConfig(bits=8, ...)`` argument -to ``from_pretrained()`` method containing number of bits and other -compression parameters. - -.. code:: ipython3 - - import ipywidgets as widgets - - compress_weights = widgets.Checkbox( - description="Apply weight compression", - value=True, - ) - - compress_weights - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Apply weight compression') - - - -.. code:: ipython3 - - if not model_dir.exists(): - !optimum-cli export openvino -m stabilityai/stable-diffusion-xl-base-1.0 --weight-format int8 {model_dir} - - text2image_pipe = OVStableDiffusionXLPipeline.from_pretrained(model_dir, device=device.value) - -Run Text2Image generation pipeline -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Now, we can run the model for the generation of images using text -prompts. 
To speed up evaluation and reduce the required memory we -decrease ``num_inference_steps`` and image size (using ``height`` and -``width``). You can modify them to suit your needs and depend on the -target hardware. We also specified a ``generator`` parameter based on a -numpy random state with a specific seed for results reproducibility. - -.. code:: ipython3 - - import torch - - prompt = "cute cat 4k, high-res, masterpiece, best quality, full hd, extremely detailed, soft lighting, dynamic angle, 35mm" - image = text2image_pipe( - prompt, - num_inference_steps=25, - height=512, - width=512, - generator=torch.Generator(device="cpu").manual_seed(903512), - ).images[0] - image.save("cat.png") - image - - - -.. parsed-literal:: - - 0%| | 0/25 [00:00`__ - library on MacOS. The issue may be environment dependent and may - occur on other OSes. - -`Stable Fast 3D -(SF3D) `__ is a large -reconstruction model based on -`TripoSR `__, which -takes in a single image of an object and generates a textured -UV-unwrapped 3D mesh asset. - -You can find `the source code on -GitHub `__ and read the -paper `SF3D: Stable Fast 3D Mesh Reconstruction with UV-unwrapping and -Illumination Disentanglement `__. - -.. figure:: https://github.com/Stability-AI/stable-fast-3d/blob/main/demo_files/teaser.gif?raw=true - :alt: Teaser Video - - Teaser Video - -Unlike most existing approaches, SF3D is explicitly trained for mesh -generation, incorporating a fast UV unwrapping technique that enables -swift texture generation rather than relying on vertex colors. The -method also learns to predict material parameters and normal maps to -enhance the visual quality of the reconstructed 3D meshes. - -The authors compare their results with TripoSR: - -.. figure:: https://github.com/user-attachments/assets/fb1277e5-610f-47d7-97e4-1267624f7f1f - :alt: sf3d-improvements - - sf3d-improvements - -.. - - The top shows the effect of light bake-in when relighting the asset. - SF3D produces a more plausible relighting. By not using vertex - colors, our method is capable of encoding finer details while also - having a lower polygon count. Our vertex displacement enables - estimating smooth shapes, which do not introduce stair-stepping - artifacts from marching cubes. Lastly, our material property - prediction allows us to express a variety of different surface types. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Get the original model <#get-the-original-model>`__ -- `Convert the model to OpenVINO - IR <#convert-the-model-to-openvino-ir>`__ -- `Compiling models and prepare - pipeline <#compiling-models-and-prepare-pipeline>`__ -- `Interactive inference <#interactive-inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. 
code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - - %pip install -q "gradio>=4.19" "openvino>=2024.3.0" wheel "gradio-litmodel3d==0.0.1" - %pip install -q "torch>=2.2.2" torchvision "transformers>=4.42.3" "open_clip_torch==2.24.0" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "omegaconf==2.4.0.dev3" - %pip install -q "git+https://github.com/vork/PyNanoInstantMeshes.git" - %pip install -q jaxtyping gpytoolbox trimesh einops - # rembg requires opencv-python-headless that can't be installed with opencv-python. So, we install rembg without dependencies and install its requirements - %pip install -q rembg --no-deps - %pip install -q onnxruntime "opencv-python" pymatting pooch pynim - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("stable-fast-3d.ipynb") - -.. code:: ipython3 - - from pathlib import Path - - from cmd_helper import clone_repo - - - clone_repo("https://github.com/Stability-AI/stable-fast-3d", "2b35658e6fa41df4171f15d8696102c62845f16a") - - %pip install -q {Path("stable-fast-3d/texture_baker/")} - %pip install -q {Path("stable-fast-3d/uv_unwrapper/")} - -Get the original model ----------------------- - -.. code:: ipython3 - - from sf3d.system import SF3D - - - model = SF3D.from_pretrained( - "stabilityai/stable-fast-3d", - config_name="config.yaml", - weight_name="model.safetensors", - ) - -Convert the model to OpenVINO IR -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -SF3D is PyTorch model. OpenVINO supports PyTorch models via conversion -to OpenVINO Intermediate Representation (IR). `OpenVINO model conversion -API `__ -should be used for these purposes. ``ov.convert_model`` function accepts -original PyTorch model instance and example input for tracing and -returns ``ov.Model`` representing this model in OpenVINO framework. -Converted model can be used for saving on disk using ``ov.save_model`` -function or directly loading on device using ``core.complie_model``. -``ov_stable_fast_3d_helper.py`` script contains helper function for -model conversion, please check its content if you interested in -conversion details. - -.. raw:: html - -
- -.. raw:: html - - - -Click here for more detailed explanation of conversion steps - -.. raw:: html - - - -.. figure:: https://github.com/user-attachments/assets/8b37e08e-ddda-4dae-b5de-cf3adc4b79c8 - :alt: sf3d-overview - - sf3d-overview - -As illustrated in SF3D Overview image, SF3D has 5 main components: - -1. An enhanced transformer network that predicts higher resolution - triplanes, which helps in reducing aliasing artifacts (top left in - the figure). In this part ``LinearCameraEmbedder`` - (``camera_embedder`` in the implemented pipeline) obtains camera - embeddings for ``DINOv2`` model (``image_tokenizer``) that obtains - image tokens. ``TriplaneLearnablePositionalEmbedding`` model - (``tokenizer``) obtains triplane tokens. The transformer - ``TwoStreamInterleaveTransformer`` (``backbone``) gets triplane - tokens (``hidden_states``) and image tokens - (``encoder_hidden_states``). Then ``PixelShuffleUpsampleNetwork`` - (``post_processor``) processes the output. We will convert all these - 5 models to OpenVINO format and then replace the original models by - compiled OV-models in the original pipeline. Here is a specific for - ``DINOv2`` model that calls ``nn.functional.interpolate`` in its - method ``interpolate_pos_encoding``. This method accepts a tuple of - floats as ``scale_factor``, but during conversion a tuple of floats - converts to a tuple of tensors due to conversion specific. It raises - an error. So, we need to patch it by converting in float. - -2. Material Estimation. ``MaterialNet`` is implemented in - ``ClipBasedHeadEstimator`` model (``image_estimator``). We will - convert it too. - -3. Illumination Modeling. It is not demonstrated in the original demo - and its results are not used in the original pipeline, so we will not - use it too. Thus ``global_estimator`` is not needed to be converted. - -4. Mesh Extraction and Refinement. In these part ``MaterialMLP`` - (``decoder``) is used. The ``decoder`` accepts lists of include or - exclude heads in forward method and uses them to choose a part of - heads. We can’t accept a list of strings in IR-model, but we can - build 2 decoders with required structures. - -5. Fast UV-Unwrapping and Export. It is finalizing step and there are no - models for conversion. - -.. raw:: html - -
- -.. code:: ipython3 - - from ov_stable_fast_3d_helper import ( - convert_image_tokenizer, - convert_tokenizer, - convert_backbone, - convert_post_processor, - convert_camera_embedder, - convert_image_estimator, - convert_decoder, - ) - - # uncomment the code below to see the model conversion code of convert_image_tokenizer. - # replace the function name if you want see the code for another model - - # ??convert_image_tokenizer - -.. code:: ipython3 - - IMAGE_TOKENIZER_OV_PATH = Path("models/image_tokenizer_ir.xml") - TOKENIZER_OV_PATH = Path("models/tokenizer_ir.xml") - BACKBONE_OV_PATH = Path("models/backbone_ir.xml") - POST_PROCESSOR_OV_PATH = Path("models/post_processor_ir.xml") - CAMERA_EMBEDDER_OV_PATH = Path("models/camera_embedder_ir.xml") - IMAGE_ESTIMATOR_OV_PATH = Path("models/image_estimator_ir.xml") - INCLUDE_DECODER_OV_PATH = Path("models/include_decoder_ir.xml") - EXCLUDE_DECODER_OV_PATH = Path("models/exclude_decoder_ir.xml") - - - convert_image_tokenizer(model.image_tokenizer, IMAGE_TOKENIZER_OV_PATH) - convert_tokenizer(model.tokenizer, TOKENIZER_OV_PATH) - convert_backbone(model.backbone, BACKBONE_OV_PATH) - convert_post_processor(model.post_processor, POST_PROCESSOR_OV_PATH) - convert_camera_embedder(model.camera_embedder, CAMERA_EMBEDDER_OV_PATH) - convert_image_estimator(model.image_estimator, IMAGE_ESTIMATOR_OV_PATH) - convert_decoder(model.decoder, INCLUDE_DECODER_OV_PATH, EXCLUDE_DECODER_OV_PATH) - -Compiling models and prepare pipeline -------------------------------------- - - - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - -``get_compiled_model`` function defined in ``ov_ov_stable_fast_3d.py`` -provides convenient way for getting compiled ov-model that is compatible -with the original interface. It accepts the original model, inference -device and directories with converted models as arguments. - -.. code:: ipython3 - - from ov_stable_fast_3d_helper import get_compiled_model - - - model = get_compiled_model( - model, - device, - IMAGE_TOKENIZER_OV_PATH, - TOKENIZER_OV_PATH, - BACKBONE_OV_PATH, - POST_PROCESSOR_OV_PATH, - CAMERA_EMBEDDER_OV_PATH, - IMAGE_ESTIMATOR_OV_PATH, - INCLUDE_DECODER_OV_PATH, - EXCLUDE_DECODER_OV_PATH, - ) - -Interactive inference ---------------------- - -It’s taken from the original -``gradio_app.py``, but the model is replaced with the one defined above. - -.. 
code:: ipython3 - - import requests - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-fast-3d/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(model=model) - - try: - demo.launch(debug=True) - except Exception: - demo.launch(share=True, debug=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/stable-video-diffusion-with-output.rst b/docs/notebooks/stable-video-diffusion-with-output.rst deleted file mode 100644 index 71aab77cefd39b..00000000000000 --- a/docs/notebooks/stable-video-diffusion-with-output.rst +++ /dev/null @@ -1,1408 +0,0 @@ -Image to Video Generation with Stable Video Diffusion -===================================================== - -Stable Video Diffusion (SVD) Image-to-Video is a diffusion model that -takes in a still image as a conditioning frame, and generates a video -from it. In this tutorial we consider how to convert and run Stable -Video Diffusion using OpenVINO. We will use -`stable-video-diffusion-img2video-xt `__ -model as example. Additionally, to speedup video generation process we -apply `AnimateLCM `__ LoRA weights and -run optimization with -`NNCF `__. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Download PyTorch Model <#download-pytorch-model>`__ -- `Convert Model to OpenVINO Intermediate - Representation <#convert-model-to-openvino-intermediate-representation>`__ - - - `Image Encoder <#image-encoder>`__ - - `U-net <#u-net>`__ - - `VAE Encoder and Decoder <#vae-encoder-and-decoder>`__ - -- `Prepare Inference Pipeline <#prepare-inference-pipeline>`__ -- `Run Video Generation <#run-video-generation>`__ - - - `Select Inference Device <#select-inference-device>`__ - -- `Quantization <#quantization>`__ - - - `Prepare calibration dataset <#prepare-calibration-dataset>`__ - - `Run Hybrid Model Quantization <#run-hybrid-model-quantization>`__ - - `Run Weight Compression <#run-weight-compression>`__ - - `Compare model file sizes <#compare-model-file-sizes>`__ - - `Compare inference time of the FP16 and INT8 - pipelines <#compare-inference-time-of-the-fp16-and-int8-pipelines>`__ - -- `Interactive Demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "torch>=2.1" "diffusers>=0.25" "peft>=0.6.2" "transformers" "openvino>=2024.1.0" Pillow opencv-python tqdm "gradio>=4.19" safetensors --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q datasets "nncf>=2.10.0" - -Download PyTorch Model ----------------------- - - - -The code below load Stable Video Diffusion XT model using -`Diffusers `__ library and -apply Consistency Distilled AnimateLCM weights. - -.. 
code:: ipython3 - - import torch - from pathlib import Path - from diffusers import StableVideoDiffusionPipeline - from diffusers.utils import load_image, export_to_video - from diffusers.models.attention_processor import AttnProcessor - from safetensors import safe_open - import gc - import requests - - - if not Path("lcm_scheduler.py").exists(): - lcm_scheduler_url = "https://huggingface.co/spaces/wangfuyun/AnimateLCM-SVD/raw/main/lcm_scheduler.py" - - r = requests.get(lcm_scheduler_url) - - with open("lcm_scheduler.py", "w") as f: - f.write(r.text) - - from lcm_scheduler import AnimateLCMSVDStochasticIterativeScheduler - from huggingface_hub import hf_hub_download - - MODEL_DIR = Path("model") - - IMAGE_ENCODER_PATH = MODEL_DIR / "image_encoder.xml" - VAE_ENCODER_PATH = MODEL_DIR / "vae_encoder.xml" - VAE_DECODER_PATH = MODEL_DIR / "vae_decoder.xml" - UNET_PATH = MODEL_DIR / "unet.xml" - - - load_pt_pipeline = not (VAE_ENCODER_PATH.exists() and VAE_DECODER_PATH.exists() and UNET_PATH.exists() and IMAGE_ENCODER_PATH.exists()) - - unet, vae, image_encoder = None, None, None - if load_pt_pipeline: - noise_scheduler = AnimateLCMSVDStochasticIterativeScheduler( - num_train_timesteps=40, - sigma_min=0.002, - sigma_max=700.0, - sigma_data=1.0, - s_noise=1.0, - rho=7, - clip_denoised=False, - ) - pipe = StableVideoDiffusionPipeline.from_pretrained( - "stabilityai/stable-video-diffusion-img2vid-xt", - variant="fp16", - scheduler=noise_scheduler, - ) - pipe.unet.set_attn_processor(AttnProcessor()) - hf_hub_download( - repo_id="wangfuyun/AnimateLCM-SVD-xt", - filename="AnimateLCM-SVD-xt.safetensors", - local_dir="./checkpoints", - ) - state_dict = {} - LCM_LORA_PATH = Path( - "checkpoints/AnimateLCM-SVD-xt.safetensors", - ) - with safe_open(LCM_LORA_PATH, framework="pt", device="cpu") as f: - for key in f.keys(): - state_dict[key] = f.get_tensor(key) - missing, unexpected = pipe.unet.load_state_dict(state_dict, strict=True) - - pipe.scheduler.save_pretrained(MODEL_DIR / "scheduler") - pipe.feature_extractor.save_pretrained(MODEL_DIR / "feature_extractor") - unet = pipe.unet - unet.eval() - vae = pipe.vae - vae.eval() - image_encoder = pipe.image_encoder - image_encoder.eval() - del pipe - gc.collect() - - # Load the conditioning image - - image_path = Path("rocket.png") - if not image_path.exists(): - image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png?download=true") - image = image.resize((512, 256)) - image.save(image_path) - else: - image = load_image(image_path) - -Convert Model to OpenVINO Intermediate Representation ------------------------------------------------------ - - - -OpenVINO supports PyTorch models via conversion into Intermediate -Representation (IR) format. We need to provide a model object, input -data for model tracing to ``ov.convert_model`` function to obtain -OpenVINO ``ov.Model`` object instance. Model can be saved on disk for -next deployment using ``ov.save_model`` function. - -Stable Video Diffusion consists of 3 parts: - -- **Image Encoder** for extraction embeddings from the input image. -- **U-Net** for step-by-step denoising video clip. -- **VAE** for encoding input image into latent space and decoding - generated video. - -Let’s convert each part. - -Image Encoder -~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - import openvino as ov - - - def cleanup_torchscript_cache(): - """ - Helper for removing cached model representation - """ - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - - - if not IMAGE_ENCODER_PATH.exists(): - with torch.no_grad(): - ov_model = ov.convert_model( - image_encoder, - example_input=torch.zeros((1, 3, 224, 224)), - input=[-1, 3, 224, 224], - ) - ov.save_model(ov_model, IMAGE_ENCODER_PATH) - del ov_model - cleanup_torchscript_cache() - print(f"Image Encoder successfully converted to IR and saved to {IMAGE_ENCODER_PATH}") - del image_encoder - gc.collect(); - -U-net -~~~~~ - - - -.. code:: ipython3 - - if not UNET_PATH.exists(): - unet_inputs = { - "sample": torch.ones([2, 2, 8, 32, 32]), - "timestep": torch.tensor(1.256), - "encoder_hidden_states": torch.zeros([2, 1, 1024]), - "added_time_ids": torch.ones([2, 3]), - } - with torch.no_grad(): - ov_model = ov.convert_model(unet, example_input=unet_inputs) - ov.save_model(ov_model, UNET_PATH) - del ov_model - cleanup_torchscript_cache() - print(f"UNet successfully converted to IR and saved to {UNET_PATH}") - - del unet - gc.collect(); - -VAE Encoder and Decoder -~~~~~~~~~~~~~~~~~~~~~~~ - - - -As discussed above VAE model used for encoding initial image and -decoding generated video. Encoding and Decoding happen on different -pipeline stages, so for convenient usage we separate VAE on 2 parts: -Encoder and Decoder. - -.. code:: ipython3 - - class VAEEncoderWrapper(torch.nn.Module): - def __init__(self, vae): - super().__init__() - self.vae = vae - - def forward(self, image): - return self.vae.encode(x=image)["latent_dist"].sample() - - - class VAEDecoderWrapper(torch.nn.Module): - def __init__(self, vae): - super().__init__() - self.vae = vae - - def forward(self, latents, num_frames: int): - return self.vae.decode(latents, num_frames=num_frames) - - - if not VAE_ENCODER_PATH.exists(): - vae_encoder = VAEEncoderWrapper(vae) - with torch.no_grad(): - ov_model = ov.convert_model(vae_encoder, example_input=torch.zeros((1, 3, 576, 1024))) - ov.save_model(ov_model, VAE_ENCODER_PATH) - cleanup_torchscript_cache() - print(f"VAE Encoder successfully converted to IR and saved to {VAE_ENCODER_PATH}") - del vae_encoder - gc.collect() - - if not VAE_DECODER_PATH.exists(): - vae_decoder = VAEDecoderWrapper(vae) - with torch.no_grad(): - ov_model = ov.convert_model(vae_decoder, example_input=(torch.zeros((8, 4, 72, 128)), torch.tensor(8))) - ov.save_model(ov_model, VAE_DECODER_PATH) - cleanup_torchscript_cache() - print(f"VAE Decoder successfully converted to IR and saved to {VAE_ENCODER_PATH}") - del vae_decoder - gc.collect() - - del vae - gc.collect(); - -Prepare Inference Pipeline --------------------------- - - - -The code bellow implements ``OVStableVideoDiffusionPipeline`` class for -running video generation using OpenVINO. The pipeline accepts input -image and returns the sequence of generated frames The diagram below -represents a simplified pipeline workflow. - -.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/a5671c5b-415b-4ae0-be82-9bf36527d452 - :alt: svd - - svd - -The pipeline is very similar to `Stable Diffusion Image to Image -Generation -pipeline `__ -with the only difference that Image Encoder is used instead of Text -Encoder. Model takes input image and random seed as initial prompt. 
Then -image encoded into embeddings space using Image Encoder and into latent -space using VAE Encoder and passed as input to U-Net model. Next, the -U-Net iteratively *denoises* the random latent video representations -while being conditioned on the image embeddings. The output of the -U-Net, being the noise residual, is used to compute a denoised latent -image representation via a scheduler algorithm for next iteration in -generation cycle. This process repeats the given number of times and, -finally, VAE decoder converts denoised latents into sequence of video -frames. - -.. code:: ipython3 - - from diffusers.pipelines.pipeline_utils import DiffusionPipeline - import PIL.Image - from diffusers.image_processor import VaeImageProcessor - from diffusers.utils.torch_utils import randn_tensor - from typing import Callable, Dict, List, Optional, Union - from diffusers.pipelines.stable_video_diffusion import ( - StableVideoDiffusionPipelineOutput, - ) - - - def _append_dims(x, target_dims): - """Appends dimensions to the end of a tensor until it has target_dims dimensions.""" - dims_to_append = target_dims - x.ndim - if dims_to_append < 0: - raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less") - return x[(...,) + (None,) * dims_to_append] - - - def tensor2vid(video: torch.Tensor, processor, output_type="np"): - # Based on: - # https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 - - batch_size, channels, num_frames, height, width = video.shape - outputs = [] - for batch_idx in range(batch_size): - batch_vid = video[batch_idx].permute(1, 0, 2, 3) - batch_output = processor.postprocess(batch_vid, output_type) - - outputs.append(batch_output) - - return outputs - - - class OVStableVideoDiffusionPipeline(DiffusionPipeline): - r""" - Pipeline to generate video from an input image using Stable Video Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods - implemented for all pipelines (downloading, saving, running on a particular device, etc.). - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. - image_encoder ([`~transformers.CLIPVisionModelWithProjection`]): - Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)). - unet ([`UNetSpatioTemporalConditionModel`]): - A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents. - scheduler ([`EulerDiscreteScheduler`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. - feature_extractor ([`~transformers.CLIPImageProcessor`]): - A `CLIPImageProcessor` to extract features from generated images. 
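- - Note: in this OpenVINO adaptation the `vae_encoder`, `vae_decoder`, `image_encoder` and `unet` - arguments are expected to be compiled OpenVINO models (`ov.CompiledModel` instances produced by - `core.compile_model`), not the original Diffusers modules.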
- """ - - def __init__( - self, - vae_encoder, - image_encoder, - unet, - vae_decoder, - scheduler, - feature_extractor, - ): - super().__init__() - self.vae_encoder = vae_encoder - self.vae_decoder = vae_decoder - self.image_encoder = image_encoder - self.register_to_config(unet=unet) - self.scheduler = scheduler - self.feature_extractor = feature_extractor - self.vae_scale_factor = 2 ** (4 - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - - def _encode_image(self, image, device, num_videos_per_prompt, do_classifier_free_guidance): - dtype = torch.float32 - - if not isinstance(image, torch.Tensor): - image = self.image_processor.pil_to_numpy(image) - image = self.image_processor.numpy_to_pt(image) - - # We normalize the image before resizing to match with the original implementation. - # Then we unnormalize it after resizing. - image = image * 2.0 - 1.0 - image = _resize_with_antialiasing(image, (224, 224)) - image = (image + 1.0) / 2.0 - - # Normalize the image with for CLIP input - image = self.feature_extractor( - images=image, - do_normalize=True, - do_center_crop=False, - do_resize=False, - do_rescale=False, - return_tensors="pt", - ).pixel_values - - image = image.to(device=device, dtype=dtype) - image_embeddings = torch.from_numpy(self.image_encoder(image)[0]) - image_embeddings = image_embeddings.unsqueeze(1) - - # duplicate image embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = image_embeddings.shape - image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1) - image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1) - - if do_classifier_free_guidance: - negative_image_embeddings = torch.zeros_like(image_embeddings) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - image_embeddings = torch.cat([negative_image_embeddings, image_embeddings]) - return image_embeddings - - def _encode_vae_image( - self, - image: torch.Tensor, - device, - num_videos_per_prompt, - do_classifier_free_guidance, - ): - image_latents = torch.from_numpy(self.vae_encoder(image)[0]) - - if do_classifier_free_guidance: - negative_image_latents = torch.zeros_like(image_latents) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - image_latents = torch.cat([negative_image_latents, image_latents]) - - # duplicate image_latents for each generation per prompt, using mps friendly method - image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1) - - return image_latents - - def _get_add_time_ids( - self, - fps, - motion_bucket_id, - noise_aug_strength, - dtype, - batch_size, - num_videos_per_prompt, - do_classifier_free_guidance, - ): - add_time_ids = [fps, motion_bucket_id, noise_aug_strength] - - passed_add_embed_dim = 256 * len(add_time_ids) - expected_add_embed_dim = 3 * 256 - - if expected_add_embed_dim != passed_add_embed_dim: - raise ValueError( - f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." 
- ) - - add_time_ids = torch.tensor([add_time_ids], dtype=dtype) - add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1) - - if do_classifier_free_guidance: - add_time_ids = torch.cat([add_time_ids, add_time_ids]) - - return add_time_ids - - def decode_latents(self, latents, num_frames, decode_chunk_size=14): - # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width] - latents = latents.flatten(0, 1) - - latents = 1 / 0.18215 * latents - - # decode decode_chunk_size frames at a time to avoid OOM - frames = [] - for i in range(0, latents.shape[0], decode_chunk_size): - frame = torch.from_numpy(self.vae_decoder([latents[i : i + decode_chunk_size], num_frames])[0]) - frames.append(frame) - frames = torch.cat(frames, dim=0) - - # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width] - frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4) - - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - frames = frames.float() - return frames - - def check_inputs(self, image, height, width): - if not isinstance(image, torch.Tensor) and not isinstance(image, PIL.Image.Image) and not isinstance(image, list): - raise ValueError("`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}") - - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - def prepare_latents( - self, - batch_size, - num_frames, - num_channels_latents, - height, - width, - dtype, - device, - generator, - latents=None, - ): - shape = ( - batch_size, - num_frames, - num_channels_latents // 2, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - else: - latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @torch.no_grad() - def __call__( - self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor], - height: int = 320, - width: int = 512, - num_frames: Optional[int] = 8, - num_inference_steps: int = 4, - min_guidance_scale: float = 1.0, - max_guidance_scale: float = 1.2, - fps: int = 7, - motion_bucket_id: int = 80, - noise_aug_strength: int = 0.01, - decode_chunk_size: Optional[int] = None, - num_videos_per_prompt: Optional[int] = 1, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - return_dict: bool = True, - ): - r""" - The call function to the pipeline for generation. - - Args:discussed - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): - Image or images to guide image generation. 
If you provide a tensor, it needs to be compatible with - [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json). - height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The width in pixels of the generated image. - num_frames (`int`, *optional*): - The number of video frames to generate. Defaults to 14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt` - num_inference_steps (`int`, *optional*, defaults to 25): - - - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. This parameter is modulated by `strength`. - min_guidance_scale (`float`, *optional*, defaults to 1.0): - The minimum guidance scale. Used for the classifier free guidance with first frame. - max_guidance_scale (`float`, *optional*, defaults to 3.0): - The maximum guidance scale. Used for the classifier free guidance with last frame. - fps (`int`, *optional*, defaults to 7): - Frames per second. The rate at which the generated images shall be exported to a video after generation. - Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training. - motion_bucket_id (`int`, *optional*, defaults to 127): - The motion bucket ID. Used as conditioning for the generation. The higher the number the more motion will be in the video. - noise_aug_strength (`int`, *optional*, defaults to 0.02): - The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion. - decode_chunk_size (`int`, *optional*): - The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency - between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once - for maximal quality. Reduce `decode_chunk_size` to reduce memory usage. - num_videos_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor is generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. 
You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - - Returns: - [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned, - otherwise a `tuple` is returned where the first element is a list of list with the generated frames. - - Examples: - - ```py - from diffusers import StableVideoDiffusionPipeline - from diffusers.utils import load_image, export_to_video - - pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16") - pipe.to("cuda") - - image = load_image("https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200") - image = image.resize((1024, 576)) - - frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0] - export_to_video(frames, "generated.mp4", fps=7) - ``` - """ - # 0. Default height and width to unet - height = height or 96 * self.vae_scale_factor - width = width or 96 * self.vae_scale_factor - - num_frames = num_frames if num_frames is not None else 25 - decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames - - # 1. Check inputs. Raise error if not correct - self.check_inputs(image, height, width) - - # 2. Define call parameters - if isinstance(image, PIL.Image.Image): - batch_size = 1 - elif isinstance(image, list): - batch_size = len(image) - else: - batch_size = image.shape[0] - device = torch.device("cpu") - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = max_guidance_scale > 1.0 - - # 3. Encode input image - image_embeddings = self._encode_image(image, device, num_videos_per_prompt, do_classifier_free_guidance) - - # NOTE: Stable Diffusion Video was conditioned on fps - 1, which - # is why it is reduced here. - # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188 - fps = fps - 1 - - # 4. Encode input image using VAE - image = self.image_processor.preprocess(image, height=height, width=width) - noise = randn_tensor(image.shape, generator=generator, device=image.device, dtype=image.dtype) - image = image + noise_aug_strength * noise - - image_latents = self._encode_vae_image(image, device, num_videos_per_prompt, do_classifier_free_guidance) - image_latents = image_latents.to(image_embeddings.dtype) - - # Repeat the image latents for each frame so we can concatenate them with the noise - # image_latents [batch, channels, height, width] ->[batch, num_frames, channels, height, width] - image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1) - - # 5. Get Added Time IDs - added_time_ids = self._get_add_time_ids( - fps, - motion_bucket_id, - noise_aug_strength, - image_embeddings.dtype, - batch_size, - num_videos_per_prompt, - do_classifier_free_guidance, - ) - added_time_ids = added_time_ids - - # 4. 
Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - # 5. Prepare latent variables - num_channels_latents = 8 - latents = self.prepare_latents( - batch_size * num_videos_per_prompt, - num_frames, - num_channels_latents, - height, - width, - image_embeddings.dtype, - device, - generator, - latents, - ) - - # 7. Prepare guidance scale - guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0) - guidance_scale = guidance_scale.to(device, latents.dtype) - guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1) - guidance_scale = _append_dims(guidance_scale, latents.ndim) - - # 8. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - num_timesteps = len(timesteps) - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # Concatenate image_latents over channels dimention - latent_model_input = torch.cat([latent_model_input, image_latents], dim=2) - # predict the noise residual - noise_pred = torch.from_numpy( - self.unet( - [ - latent_model_input, - t, - image_embeddings, - added_time_ids, - ] - )[0] - ) - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents).prev_sample - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - - if not output_type == "latent": - frames = self.decode_latents(latents, num_frames, decode_chunk_size) - frames = tensor2vid(frames, self.image_processor, output_type=output_type) - else: - frames = latents - - if not return_dict: - return frames - - return StableVideoDiffusionPipelineOutput(frames=frames) - - - # resizing utils - def _resize_with_antialiasing(input, size, interpolation="bicubic", align_corners=True): - h, w = input.shape[-2:] - factors = (h / size[0], w / size[1]) - - # First, we have to determine sigma - # Taken from skimage: https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171 - sigmas = ( - max((factors[0] - 1.0) / 2.0, 0.001), - max((factors[1] - 1.0) / 2.0, 0.001), - ) - # Now kernel size. Good results are for 3 sigma, but that is kind of slow. Pillow uses 1 sigma - # https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206 - # But they do it in the 2 passes, which gives better results. 
Let's try 2 sigmas for now - ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3)) - - # Make sure it is odd - if (ks[0] % 2) == 0: - ks = ks[0] + 1, ks[1] - - if (ks[1] % 2) == 0: - ks = ks[0], ks[1] + 1 - - input = _gaussian_blur2d(input, ks, sigmas) - - output = torch.nn.functional.interpolate(input, size=size, mode=interpolation, align_corners=align_corners) - return output - - - def _compute_padding(kernel_size): - """Compute padding tuple.""" - # 4 or 6 ints: (padding_left, padding_right,padding_top,padding_bottom) - # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad - if len(kernel_size) < 2: - raise AssertionError(kernel_size) - computed = [k - 1 for k in kernel_size] - - # for even kernels we need to do asymmetric padding :( - out_padding = 2 * len(kernel_size) * [0] - - for i in range(len(kernel_size)): - computed_tmp = computed[-(i + 1)] - - pad_front = computed_tmp // 2 - pad_rear = computed_tmp - pad_front - - out_padding[2 * i + 0] = pad_front - out_padding[2 * i + 1] = pad_rear - - return out_padding - - - def _filter2d(input, kernel): - # prepare kernel - b, c, h, w = input.shape - tmp_kernel = kernel[:, None, ...].to(device=input.device, dtype=input.dtype) - - tmp_kernel = tmp_kernel.expand(-1, c, -1, -1) - - height, width = tmp_kernel.shape[-2:] - - padding_shape: list[int] = _compute_padding([height, width]) - input = torch.nn.functional.pad(input, padding_shape, mode="reflect") - - # kernel and input tensor reshape to align element-wise or batch-wise params - tmp_kernel = tmp_kernel.reshape(-1, 1, height, width) - input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1)) - - # convolve the tensor with the kernel. - output = torch.nn.functional.conv2d(input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1) - - out = output.view(b, c, h, w) - return out - - - def _gaussian(window_size: int, sigma): - if isinstance(sigma, float): - sigma = torch.tensor([[sigma]]) - - batch_size = sigma.shape[0] - - x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1) - - if window_size % 2 == 0: - x = x + 0.5 - - gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0))) - - return gauss / gauss.sum(-1, keepdim=True) - - - def _gaussian_blur2d(input, kernel_size, sigma): - if isinstance(sigma, tuple): - sigma = torch.tensor([sigma], dtype=input.dtype) - else: - sigma = sigma.to(dtype=input.dtype) - - ky, kx = int(kernel_size[0]), int(kernel_size[1]) - bs = sigma.shape[0] - kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1)) - kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1)) - out_x = _filter2d(input, kernel_x[..., None, :]) - out = _filter2d(out_x, kernel_y[..., None]) - - return out - -Run Video Generation --------------------- - - - -Select Inference Device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - import requests - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("stable-video-diffusion.ipynb") - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. 
code:: ipython3 - - from transformers import CLIPImageProcessor - - core = ov.Core() - - vae_encoder = core.compile_model(VAE_ENCODER_PATH, device.value) - image_encoder = core.compile_model(IMAGE_ENCODER_PATH, device.value) - unet = core.compile_model(UNET_PATH, device.value) - vae_decoder = core.compile_model(VAE_DECODER_PATH, device.value) - scheduler = AnimateLCMSVDStochasticIterativeScheduler.from_pretrained(MODEL_DIR / "scheduler") - feature_extractor = CLIPImageProcessor.from_pretrained(MODEL_DIR / "feature_extractor") - -Now, let’s see model in action. > Please, note, video generation is -memory and time consuming process. For reducing memory consumption, we -decreased input video resolution to 576x320 and number of generated -frames that may affect quality of generated video. You can change these -settings manually providing ``height``, ``width`` and ``num_frames`` -parameters into pipeline. - -.. code:: ipython3 - - ov_pipe = OVStableVideoDiffusionPipeline(vae_encoder, image_encoder, unet, vae_decoder, scheduler, feature_extractor) - -.. code:: ipython3 - - frames = ov_pipe( - image, - num_inference_steps=4, - motion_bucket_id=60, - num_frames=8, - height=320, - width=512, - generator=torch.manual_seed(12342), - ).frames[0] - -.. code:: ipython3 - - out_path = Path("generated.mp4") - - export_to_video(frames, str(out_path), fps=7) - frames[0].save( - "generated.gif", - save_all=True, - append_images=frames[1:], - optimize=False, - duration=120, - loop=0, - ) - -.. code:: ipython3 - - from IPython.display import HTML - - HTML('') - - - - -.. raw:: html - - - - - -Quantization ------------- - - - -`NNCF `__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -According to ``OVStableVideoDiffusionPipeline`` structure, the diffusion -model takes up significant portion of the overall pipeline execution -time. Now we will show you how to optimize the UNet part using -`NNCF `__ to reduce -computation cost and speed up the pipeline. Quantizing the rest of the -pipeline does not significantly improve inference performance but can -lead to a substantial degradation of accuracy. That’s why we use only -weight compression for the ``vae encoder`` and ``vae decoder`` to reduce -the memory footprint. - -For the UNet model we apply quantization in hybrid mode which means that -we quantize: (1) weights of MatMul and Embedding layers and (2) -activations of other layers. The steps are the following: - -1. Create a calibration dataset for quantization. -2. Collect operations with weights. -3. Run ``nncf.compress_model()`` to compress only the model weights. -4. Run ``nncf.quantize()`` on the compressed model with weighted - operations ignored by providing ``ignored_scope`` parameter. -5. Save the ``INT8`` model using ``openvino.save_model()`` function. - -Please select below whether you would like to run quantization to -improve model inference speed. - - **NOTE**: Quantization is time and memory consuming operation. - Running quantization code below may take some time. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget() - - to_quantize - -.. 
code:: ipython3 - - # Fetch `skip_kernel_extension` module - import requests - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - ov_int8_pipeline = None - OV_INT8_UNET_PATH = MODEL_DIR / "unet_int8.xml" - OV_INT8_VAE_ENCODER_PATH = MODEL_DIR / "vae_encoder_int8.xml" - OV_INT8_VAE_DECODER_PATH = MODEL_DIR / "vae_decoder_int8.xml" - - %load_ext skip_kernel_extension - -Prepare calibration dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use a portion of -`fusing/instructpix2pix-1000-samples `__ -dataset from Hugging Face as calibration data. To collect intermediate -model inputs for UNet optimization we should customize -``CompiledModel``. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from typing import Any - - import datasets - import numpy as np - from tqdm.notebook import tqdm - from IPython.utils import io - - - class CompiledModelDecorator(ov.CompiledModel): - def __init__(self, compiled_model: ov.CompiledModel, data_cache: List[Any] = None, keep_prob: float = 0.5): - super().__init__(compiled_model) - self.data_cache = data_cache if data_cache is not None else [] - self.keep_prob = keep_prob - - def __call__(self, *args, **kwargs): - if np.random.rand() <= self.keep_prob: - self.data_cache.append(*args) - return super().__call__(*args, **kwargs) - - - def collect_calibration_data(ov_pipe, calibration_dataset_size: int, num_inference_steps: int = 50) -> List[Dict]: - original_unet = ov_pipe.unet - calibration_data = [] - ov_pipe.unet = CompiledModelDecorator(original_unet, calibration_data, keep_prob=1) - - dataset = datasets.load_dataset("fusing/instructpix2pix-1000-samples", split="train", streaming=False).shuffle(seed=42) - # Run inference for data collection - pbar = tqdm(total=calibration_dataset_size) - for batch in dataset: - image = batch["input_image"] - - with io.capture_output() as captured: - ov_pipe( - image, - num_inference_steps=4, - motion_bucket_id=60, - num_frames=8, - height=256, - width=256, - generator=torch.manual_seed(12342), - ) - pbar.update(len(calibration_data) - pbar.n) - if len(calibration_data) >= calibration_dataset_size: - break - - ov_pipe.unet = original_unet - return calibration_data[:calibration_dataset_size] - -.. code:: ipython3 - - %%skip not $to_quantize.value - - if not OV_INT8_UNET_PATH.exists(): - subset_size = 200 - calibration_data = collect_calibration_data(ov_pipe, calibration_dataset_size=subset_size) - -Run Hybrid Model Quantization -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - from collections import deque - - def get_operation_const_op(operation, const_port_id: int): - node = operation.input_value(const_port_id).get_node() - queue = deque([node]) - constant_node = None - allowed_propagation_types_list = ["Convert", "FakeQuantize", "Reshape"] - - while len(queue) != 0: - curr_node = queue.popleft() - if curr_node.get_type_name() == "Constant": - constant_node = curr_node - break - if len(curr_node.inputs()) == 0: - break - if curr_node.get_type_name() in allowed_propagation_types_list: - queue.append(curr_node.input_value(0).get_node()) - - return constant_node - - - def is_embedding(node) -> bool: - allowed_types_list = ["f16", "f32", "f64"] - const_port_id = 0 - input_tensor = node.input_value(const_port_id) - if input_tensor.get_element_type().get_type_name() in allowed_types_list: - const_node = get_operation_const_op(node, const_port_id) - if const_node is not None: - return True - - return False - - - def collect_ops_with_weights(model): - ops_with_weights = [] - for op in model.get_ops(): - if op.get_type_name() == "MatMul": - constant_node_0 = get_operation_const_op(op, const_port_id=0) - constant_node_1 = get_operation_const_op(op, const_port_id=1) - if constant_node_0 or constant_node_1: - ops_with_weights.append(op.get_friendly_name()) - if op.get_type_name() == "Gather" and is_embedding(op): - ops_with_weights.append(op.get_friendly_name()) - - return ops_with_weights - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - import logging - from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters - - nncf.set_log_level(logging.ERROR) - - if not OV_INT8_UNET_PATH.exists(): - diffusion_model = core.read_model(UNET_PATH) - unet_ignored_scope = collect_ops_with_weights(diffusion_model) - compressed_diffusion_model = nncf.compress_weights(diffusion_model, ignored_scope=nncf.IgnoredScope(types=['Convolution'])) - quantized_diffusion_model = nncf.quantize( - model=compressed_diffusion_model, - calibration_dataset=nncf.Dataset(calibration_data), - subset_size=subset_size, - model_type=nncf.ModelType.TRANSFORMER, - # We additionally ignore the first convolution to improve the quality of generations - ignored_scope=nncf.IgnoredScope(names=unet_ignored_scope + ["__module.conv_in/aten::_convolution/Convolution"]), - advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alphas=AdvancedSmoothQuantParameters(matmul=-1)) - ) - ov.save_model(quantized_diffusion_model, OV_INT8_UNET_PATH) - -Run Weight Compression -~~~~~~~~~~~~~~~~~~~~~~ - - - -Quantizing of the ``vae encoder`` and ``vae decoder`` does not -significantly improve inference performance but can lead to a -substantial degradation of accuracy. Only weight compression will be -applied for footprint reduction. - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - nncf.set_log_level(logging.INFO) - - if not OV_INT8_VAE_ENCODER_PATH.exists(): - text_encoder_model = core.read_model(VAE_ENCODER_PATH) - compressed_text_encoder_model = nncf.compress_weights(text_encoder_model, mode=nncf.CompressWeightsMode.INT4_SYM, group_size=64) - ov.save_model(compressed_text_encoder_model, OV_INT8_VAE_ENCODER_PATH) - - if not OV_INT8_VAE_DECODER_PATH.exists(): - decoder_model = core.read_model(VAE_DECODER_PATH) - compressed_decoder_model = nncf.compress_weights(decoder_model, mode=nncf.CompressWeightsMode.INT4_SYM, group_size=64) - ov.save_model(compressed_decoder_model, OV_INT8_VAE_DECODER_PATH) - -Let’s compare the video generated by the original and optimized -pipelines. Dynamic quantization should be disabled for UNet model -because it introduces a performance overhead when applied to Diffusion -models that have been quantized using a ``Hybrid`` approach. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - ov_int8_vae_encoder = core.compile_model(OV_INT8_VAE_ENCODER_PATH, device.value) - ov_int8_unet = core.compile_model(OV_INT8_UNET_PATH, device.value, config={"DYNAMIC_QUANTIZATION_GROUP_SIZE":"0"}) - ov_int8_decoder = core.compile_model(OV_INT8_VAE_DECODER_PATH, device.value) - - ov_int8_pipeline = OVStableVideoDiffusionPipeline( - ov_int8_vae_encoder, image_encoder, ov_int8_unet, ov_int8_decoder, scheduler, feature_extractor - ) - - int8_frames = ov_int8_pipeline( - image, - num_inference_steps=4, - motion_bucket_id=60, - num_frames=8, - height=320, - width=512, - generator=torch.manual_seed(12342), - ).frames[0] - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from IPython.display import display - - int8_out_path = Path("generated_int8.mp4") - - export_to_video(int8_frames, str(int8_out_path), fps=7) - int8_frames[0].save( - "generated_int8.gif", - save_all=True, - append_images=int8_frames[1:], - optimize=False, - duration=120, - loop=0, - ) - display(HTML('')) - - - -.. raw:: html - - - - -Compare model file sizes -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - fp16_model_paths = [VAE_ENCODER_PATH, UNET_PATH, VAE_DECODER_PATH] - int8_model_paths = [OV_INT8_VAE_ENCODER_PATH, OV_INT8_UNET_PATH, OV_INT8_VAE_DECODER_PATH] - - for fp16_path, int8_path in zip(fp16_model_paths, int8_model_paths): - fp16_ir_model_size = fp16_path.with_suffix(".bin").stat().st_size - int8_model_size = int8_path.with_suffix(".bin").stat().st_size - print(f"{fp16_path.stem} compression rate: {fp16_ir_model_size / int8_model_size:.3f}") - - -.. parsed-literal:: - - vae_encoder compression rate: 2.018 - unet compression rate: 1.996 - vae_decoder compression rate: 2.007 - - -Compare inference time of the FP16 and INT8 pipelines -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -To measure the inference performance of the ``FP16`` and ``INT8`` -pipelines, we use median inference time on calibration subset. - - **NOTE**: For the most accurate performance estimation, it is - recommended to run ``benchmark_app`` in a terminal/command prompt - after closing other applications. - -.. 
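- - For example, a standalone measurement of the quantized UNet with ``benchmark_app`` might look like the sketch below; the path matches the ``unet_int8.xml`` file saved above, while the device and run duration are arbitrary placeholders to adjust for your setup: - - .. code:: ipython3 - - # Benchmark the INT8 UNet for 15 seconds in asynchronous mode (run after quantization has finished). - !benchmark_app -m model/unet_int8.xml -d CPU -api async -t 15 - - ..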
code:: ipython3 - - %%skip not $to_quantize.value - - import time - - def calculate_inference_time(pipeline, validation_data): - inference_time = [] - for prompt in validation_data: - start = time.perf_counter() - with io.capture_output() as captured: - _ = pipeline( - image, - num_inference_steps=4, - motion_bucket_id=60, - num_frames=8, - height=320, - width=512, - generator=torch.manual_seed(12342), - ) - end = time.perf_counter() - delta = end - start - inference_time.append(delta) - return np.median(inference_time) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - validation_size = 3 - validation_dataset = datasets.load_dataset("fusing/instructpix2pix-1000-samples", split="train", streaming=True).shuffle(seed=42).take(validation_size) - validation_data = [data["input_image"] for data in validation_dataset] - - fp_latency = calculate_inference_time(ov_pipe, validation_data) - int8_latency = calculate_inference_time(ov_int8_pipeline, validation_data) - print(f"Performance speed-up: {fp_latency / int8_latency:.3f}") - - -.. parsed-literal:: - - Performance speed-up: 1.243 - - -Interactive Demo ----------------- - - - -Please select below whether you would like to use the quantized model to -launch the interactive demo. - -.. code:: ipython3 - - import ipywidgets as widgets - - quantized_model_present = ov_int8_pipeline is not None - - use_quantized_model = widgets.Checkbox( - value=quantized_model_present, - description="Use quantized model", - disabled=not quantized_model_present, - ) - - use_quantized_model - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Use quantized model') - - - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-video-diffusion/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - pipeline = ov_int8_pipeline if use_quantized_model.value else ov_pipe - - demo = make_demo(pipeline) - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(debug=False, share=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/style-transfer-with-output.rst b/docs/notebooks/style-transfer-with-output.rst deleted file mode 100644 index 0bea8ad2e71827..00000000000000 --- a/docs/notebooks/style-transfer-with-output.rst +++ /dev/null @@ -1,476 +0,0 @@ -Style Transfer with OpenVINO™ -============================= - -This notebook demonstrates style transfer with OpenVINO, using the Style -Transfer Models from `ONNX Model -Repository `__. Specifically, `Fast -Neural Style -Transfer `__ -model, which is designed to mix the content of an image with the style -of another image. - -.. figure:: https://user-images.githubusercontent.com/109281183/208703143-049f712d-2777-437c-8172-597ef7d53fc3.gif - :alt: style transfer - - style transfer - -This notebook uses five pre-trained models, for the following styles: -Mosaic, Rain Princess, Candy, Udnie and Pointilism. The models are from -`ONNX Model Repository `__ and are based -on the research paper `Perceptual Losses for Real-Time Style Transfer -and Super-Resolution `__ along with -`Instance Normalization `__. Final -part of this notebook shows live inference results from a webcam. 
-Additionally, you can also upload a video file. - - **NOTE**: If you have a webcam on your computer, you can see live - results streaming in the notebook. If you run the notebook on a - server, the webcam will not work but you can run inference, using a - video file. - - -**Table of contents:** - - -- `Preparation <#preparation>`__ - - - `Install requirements <#install-requirements>`__ - - `Imports <#imports>`__ - -- `The Model <#the-model>`__ - - - `Download the Model <#download-the-model>`__ - - `Convert ONNX Model to OpenVINO IR - Format <#convert-onnx-model-to-openvino-ir-format>`__ - - `Load the Model <#load-the-model>`__ - - `Preprocess the image <#preprocess-the-image>`__ - - `Helper function to postprocess the stylized - image <#helper-function-to-postprocess-the-stylized-image>`__ - - `Main Processing Function <#main-processing-function>`__ - - `Run Style Transfer <#run-style-transfer>`__ - -- `References <#references>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Preparation ------------ - - - -Install requirements -~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %pip install -q "openvino>=2023.1.0" - %pip install -q opencv-python requests tqdm - - from pathlib import Path - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("style-transfer.ipynb") - -Imports -~~~~~~~ - - - -.. code:: ipython3 - - import collections - import time - - import cv2 - import numpy as np - from pathlib import Path - import ipywidgets as widgets - from IPython.display import display, clear_output, Image - import openvino as ov - - import notebook_utils as utils - -Select one of the styles below: Mosaic, Rain Princess, Candy, Udnie, and -Pointilism to do the style transfer. - -.. code:: ipython3 - - # Option to select different styles using a dropdown - style_dropdown = widgets.Dropdown( - options=["MOSAIC", "RAIN-PRINCESS", "CANDY", "UDNIE", "POINTILISM"], - value="MOSAIC", # Set the default value - description="Select Style:", - disabled=False, - style={"description_width": "initial"}, # Adjust the width as needed - ) - - - # Function to handle changes in dropdown and print the selected style - def print_style(change): - if change["type"] == "change" and change["name"] == "value": - print(f"Selected style {change['new']}") - - - # Observe changes in the dropdown value - style_dropdown.observe(print_style, names="value") - - # Display the dropdown - display(style_dropdown) - -The Model ---------- - - - -Download the Model -~~~~~~~~~~~~~~~~~~ - - - -The style transfer model, selected in the previous step, will be -downloaded to ``model_path`` if you have not already downloaded it. The -models are provided by the ONNX Model Zoo in ``.onnx`` format, which -means it could be used with OpenVINO directly. 
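- As a minimal sketch (assuming the default ``mosaic-9.onnx`` file has already been downloaded into the ``model`` directory as shown below), such direct loading could look like this: - - .. code:: ipython3 - - import openvino as ov - - core = ov.Core() - # OpenVINO reads the ONNX format natively, so no separate conversion step is strictly required. - compiled_onnx_model = core.compile_model("model/mosaic-9.onnx", device_name="AUTO") -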
However, this notebook -will also show how you can use the Conversion API to convert ONNX to -OpenVINO Intermediate Representation (IR) with ``FP16`` precision. - -.. code:: ipython3 - - # Directory to download the model from ONNX model zoo - base_model_dir = "model" - base_url = "https://github.com/onnx/models/raw/69d69010b7ed6ba9438c392943d2715026792d40/archive/vision/style_transfer/fast_neural_style/model" - - # Selected ONNX model will be downloaded in the path - model_path = Path(f"{style_dropdown.value.lower()}-9.onnx") - ir_path = Path(f"model/{style_dropdown.value.lower()}-9.xml") - onnx_path = Path(f"model/{model_path}") - - if not onnx_path.exists(): - style_url = f"{base_url}/{model_path}" - utils.download_file(style_url, directory=base_model_dir) - -Convert ONNX Model to OpenVINO IR Format -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -In the next step, you will convert the ONNX model to OpenVINO IR format -with ``FP16`` precision. While ONNX models are directly supported by -OpenVINO runtime, it can be useful to convert them to IR format to take -advantage of OpenVINO optimization tools and features. The -``ov.convert_model`` Python function of model conversion API can be -used. The converted model is saved to the model directory. The function -returns instance of OpenVINO Model class, which is ready to use in -Python interface but can also be serialized to OpenVINO IR format for -future execution. If the model has been already converted, you can skip -this step. - -.. code:: ipython3 - - if not ir_path.exists(): - ov_model = ov.convert_model(onnx_path) - ov.save_model(ov_model, ir_path) - -Load the Model -~~~~~~~~~~~~~~ - - - -Both the ONNX model(s) and converted IR model(s) are stored in the -``model`` directory. - -Only a few lines of code are required to run the model. First, -initialize OpenVINO Runtime. Then, read the network architecture and -model weights from the ``.bin`` and ``.xml`` files to compile for the -desired device. If you select ``GPU`` you may need to wait briefly for -it to load, as the startup time is somewhat longer than ``CPU``. - -To let OpenVINO automatically select the best device for inference just -use ``AUTO``. In most cases, the best device to use is ``GPU`` (better -performance, but slightly longer startup time). You can select one from -available devices using dropdown list below. - -OpenVINO Runtime can load ONNX models from `ONNX Model -Repository `__ directly. In such cases, -use ONNX path instead of IR model to load the model. It is recommended -to load the OpenVINO Intermediate Representation (IR) model for the best -results. - -.. code:: ipython3 - - # Initialize OpenVINO Runtime. - core = ov.Core() - - # Read the network and corresponding weights from ONNX Model. - # model = ie_core.read_model(model=onnx_path) - - # Read the network and corresponding weights from IR Model. - model = core.read_model(model=ir_path) - -.. code:: ipython3 - - device = utils.device_widget() - - - # Compile the model for CPU (or change to GPU, etc. for other devices) - # or let OpenVINO select the best available device with AUTO. - device - -.. code:: ipython3 - - compiled_model = core.compile_model(model=model, device_name=device.value) - - # Get the input and output nodes. - input_layer = compiled_model.input(0) - output_layer = compiled_model.output(0) - -Input and output layers have the names of the input node and output node -respectively. For *fast-neural-style-mosaic-onnx*, there is 1 input and -1 output with the ``(1, 3, 224, 224)`` shape. - -.. 
code:: ipython3 - - print(input_layer.any_name, output_layer.any_name) - print(input_layer.shape) - print(output_layer.shape) - - # Get the input size. - N, C, H, W = list(input_layer.shape) - -Preprocess the image -~~~~~~~~~~~~~~~~~~~~ - -Preprocess the input image -before running the model. Prepare the dimensions and channel order for -the image to match the original image with the input tensor - -1. Preprocess a frame to convert from ``unit8`` to ``float32``. -2. Transpose the array to match with the network input size - -.. code:: ipython3 - - # Preprocess the input image. - def preprocess_images(frame, H, W): - """ - Preprocess input image to align with network size - - Parameters: - :param frame: input frame - :param H: height of the frame to style transfer model - :param W: width of the frame to style transfer model - :returns: resized and transposed frame - """ - image = np.array(frame).astype("float32") - image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) - image = cv2.resize(src=image, dsize=(H, W), interpolation=cv2.INTER_AREA) - image = np.transpose(image, [2, 0, 1]) - image = np.expand_dims(image, axis=0) - return image - -Helper function to postprocess the stylized image -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The converted IR model outputs a NumPy ``float32`` array of the `(1, 3, -224, -224) `__ -shape . - -.. code:: ipython3 - - # Postprocess the result - def convert_result_to_image(frame, stylized_image) -> np.ndarray: - """ - Postprocess stylized image for visualization - - Parameters: - :param frame: input frame - :param stylized_image: stylized image with specific style applied - :returns: resized stylized image for visualization - """ - h, w = frame.shape[:2] - stylized_image = stylized_image.squeeze().transpose(1, 2, 0) - stylized_image = cv2.resize(src=stylized_image, dsize=(w, h), interpolation=cv2.INTER_CUBIC) - stylized_image = np.clip(stylized_image, 0, 255).astype(np.uint8) - stylized_image = cv2.cvtColor(stylized_image, cv2.COLOR_BGR2RGB) - return stylized_image - -Main Processing Function -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The style transfer function can be run in different operating modes, -either using a webcam or a video file. - -.. code:: ipython3 - - def run_style_transfer(source=0, flip=False, use_popup=False, skip_first_frames=0): - """ - Main function to run the style inference: - 1. Create a video player to play with target fps (utils.VideoPlayer). - 2. Prepare a set of frames for style transfer. - 3. Run AI inference for style transfer. - 4. Visualize the results. - Parameters: - source: The webcam number to feed the video stream with primary webcam set to "0", or the video path. - flip: To be used by VideoPlayer function for flipping capture image. - use_popup: False for showing encoded frames over this notebook, True for creating a popup window. - skip_first_frames: Number of frames to skip at the beginning of the video. - """ - # Create a video player to play with target fps. - player = None - try: - player = utils.VideoPlayer(source=source, flip=flip, fps=30, skip_first_frames=skip_first_frames) - # Start video capturing. - player.start() - if use_popup: - title = "Press ESC to Exit" - cv2.namedWindow(winname=title, flags=cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE) - - processing_times = collections.deque() - while True: - # Grab the frame. - frame = player.next() - if frame is None: - print("Source ended") - break - # If the frame is larger than full HD, reduce size to improve the performance. 
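- # The frame is downscaled so that its longest side is at most 720 pixels; - # smaller frames (scale >= 1) are passed through unchanged.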
- scale = 720 / max(frame.shape) - if scale < 1: - frame = cv2.resize( - src=frame, - dsize=None, - fx=scale, - fy=scale, - interpolation=cv2.INTER_AREA, - ) - # Preprocess the input image. - - image = preprocess_images(frame, H, W) - - # Measure processing time for the input image. - start_time = time.time() - # Perform the inference step. - stylized_image = compiled_model([image])[output_layer] - stop_time = time.time() - - # Postprocessing for stylized image. - result_image = convert_result_to_image(frame, stylized_image) - - processing_times.append(stop_time - start_time) - # Use processing times from last 200 frames. - if len(processing_times) > 200: - processing_times.popleft() - processing_time_det = np.mean(processing_times) * 1000 - - # Visualize the results. - f_height, f_width = frame.shape[:2] - fps = 1000 / processing_time_det - cv2.putText( - result_image, - text=f"Inference time: {processing_time_det:.1f}ms ({fps:.1f} FPS)", - org=(20, 40), - fontFace=cv2.FONT_HERSHEY_COMPLEX, - fontScale=f_width / 1000, - color=(0, 0, 255), - thickness=1, - lineType=cv2.LINE_AA, - ) - - # Use this workaround if there is flickering. - if use_popup: - cv2.imshow(title, result_image) - key = cv2.waitKey(1) - # escape = 27 - if key == 27: - break - else: - # Encode numpy array to jpg. - _, encoded_img = cv2.imencode(".jpg", result_image, params=[cv2.IMWRITE_JPEG_QUALITY, 90]) - # Create an IPython image. - i = Image(data=encoded_img) - # Display the image in this notebook. - clear_output(wait=True) - display(i) - # ctrl-c - except KeyboardInterrupt: - print("Interrupted") - # any different error - except RuntimeError as e: - print(e) - finally: - if player is not None: - # Stop capturing. - player.stop() - if use_popup: - cv2.destroyAllWindows() - -Run Style Transfer -~~~~~~~~~~~~~~~~~~ - - - -Now, try to apply the style transfer model using video from your webcam -or video file. By default, the primary webcam is set with ``source=0``. -If you have multiple webcams, each one will be assigned a consecutive -number starting at 0. Set ``flip=True`` when using a front-facing -camera. Some web browsers, especially Mozilla Firefox, may cause -flickering. If you experience flickering, set ``use_popup=True``. - - **NOTE**: To use a webcam, you must run this Jupyter notebook on a - computer with a webcam. If you run it on a server, you will not be - able to access the webcam. However, you can still perform inference - on a video file in the final step. - -If you do not have a webcam, you can still run this demo with a video -file. Any `format supported by -OpenCV `__ - -.. code:: ipython3 - - USE_WEBCAM = False - - cam_id = 0 - video_file = Path("coco.mp4") - video_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Coco%20Walking%20in%20Berkeley.mp4" - - source = cam_id if USE_WEBCAM else video_file - if not USE_WEBCAM and not video_file.exists(): - utils.download_file(video_url, filename="coco.mp4") - - run_style_transfer(source=source, flip=isinstance(source, int), use_popup=False) - -References ----------- - - - -1. `ONNX Model Zoo `__ -2. `Fast Neural Style - Transfer `__ -3. 
`Fast Neural Style Mosaic Onnx - Open Model - Zoo `__ diff --git a/docs/notebooks/surya-line-level-text-detection-with-output.rst b/docs/notebooks/surya-line-level-text-detection-with-output.rst deleted file mode 100644 index c133649d15f9f1..00000000000000 --- a/docs/notebooks/surya-line-level-text-detection-with-output.rst +++ /dev/null @@ -1,562 +0,0 @@ -Line-level text detection with Surya -==================================== - -|Colab| - -In this tutorial we will perform line-level text detection using -`Surya `__ toolkit and OpenVINO. - -.. figure:: https://github.com/VikParuchuri/surya/blob/master/static/images/excerpt.png?raw=true - :alt: line-level text detection - - line-level text detection - -`\**image source\* `__ - -Model used for line-level text detection based on -`Segformer `__. It has the -following features: \* It is specialized for document OCR. It will -likely not work on photos or other images. \* It is for printed text, -not handwriting. \* The model has trained itself to ignore -advertisements. \* Languages with very different character sets may not -work well. - - -**Table of contents:** - - -- `Fetch test image <#fetch-test-image>`__ -- `Run PyTorch inference <#run-pytorch-inference>`__ -- `Convert model to OpenVINO Intermediate Representation (IR) - format <#convert-model-to-openvino-intermediate-representation-ir-format>`__ -- `Run OpenVINO model <#run-openvino-model>`__ -- `Apply post-training quantization using - NNCF <#apply-post-training-quantization-using-nncf>`__ - - - `Prepare dataset <#prepare-dataset>`__ - - `Quantize model <#quantize-model>`__ - -- `Run quantized OpenVINO model <#run-quantized-openvino-model>`__ -- `Interactive inference <#interactive-inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. |Colab| image:: https://colab.research.google.com/assets/colab-badge.svg - :target: https://colab.research.google.com/github/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/surya-line-level-text-detection/surya-line-level-text-detection.ipynb - -Fetch test image ----------------- - - - -We will use an image from a randomly sampled subset of -`DocLayNet `__ dataset. - -.. code:: ipython3 - - import os - - os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - - %pip install -q "openvino>=2024.2.0" "nncf>=2.11.0" - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "surya-ocr==0.4.0" torch datasets "gradio>=4.19" Pillow - -.. code:: ipython3 - - from datasets import load_dataset - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("surya-line-level-text-detection.ipynb") - - - def fetch_image(): - dataset = load_dataset("vikp/doclaynet_bench", split="train", streaming=True) - return next(iter(dataset))["image"] - - - test_image = fetch_image() - test_image - - - - -.. 
image:: surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_3_0.png - - - -Run PyTorch inference ---------------------- - - - -To perform line-level text detection we will use ``load_model`` and -``load_processor`` functions from ``surya`` package. We will also use -``batch_inference`` function which performs pre and post processing. - -.. code:: ipython3 - - # Predictions visualization function - from PIL import ImageDraw - - - def visualize_prediction(image, prediction): - image = image.copy() - draw = ImageDraw.Draw(image) - - for polygon_box in prediction.bboxes: - draw.rectangle(polygon_box.bbox, width=1, outline="red") - - display(image) - -.. code:: ipython3 - - from surya.detection import batch_text_detection - from surya.model.detection.segformer import load_model, load_processor - - model, processor = load_model(), load_processor() - - predictions = batch_text_detection([test_image], model, processor) - - visualize_prediction(test_image, predictions[0]) - - -.. parsed-literal:: - - /home/ea/work/py311/lib/python3.11/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. - _torch_pytree._register_pytree_node( - /home/ea/work/py311/lib/python3.11/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. - _torch_pytree._register_pytree_node( - /home/ea/work/py311/lib/python3.11/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. - _torch_pytree._register_pytree_node( - /home/ea/work/py311/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. - warnings.warn( - - - -.. parsed-literal:: - - config.json: 0%| | 0.00/1.18k [00:00`__. -The ``ov.convert_model`` Python function returns an OpenVINO Model -object ready to load on the device and start making predictions. - -``ov.convert_model`` requires a sample of original model input. We will -use image pre-processing from ``surya`` package to prepare example -input. - -.. code:: ipython3 - - # Build example input - from surya.input.processing import prepare_image - import torch - - - def build_example_input(image, processor): - input_values = prepare_image(image.convert("RGB"), processor) - - return {"pixel_values": torch.unsqueeze(input_values, 0)} - - - example_input = build_example_input(test_image, processor) - -.. code:: ipython3 - - # Convert model - import openvino as ov - from pathlib import Path - - ov_model = ov.convert_model(model, example_input=example_input) - - FP_MODEL_PATH = Path("model.xml") - INT8_MODEL_PATH = Path("int8_model.xml") - - ov.save_model(ov_model, FP_MODEL_PATH) - -Run OpenVINO model ------------------- - - - -Select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. 
parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -We want to reuse model results postprocessing implemented in -``batch_inference`` function. In order to do that we implement simple -wrappers for OpenVINO model with interface required by -``batch_inference`` function. - -.. code:: ipython3 - - core = ov.Core() - - # Compile OpenVINO model for loading on device - compiled_ov_model = core.compile_model(ov_model, device.value) - - - class OVModelWrapperResult: - def __init__(self, logits): - self.logits = logits - - - class OVModelWrapper: - dtype = torch.float32 - device = model.device - config = model.config - - def __init__(self, ov_model) -> None: - self.ov_model = ov_model - - def __call__(self, **kwargs): - # run inference on preprocessed data and get image-text similarity score - logits = self.ov_model(kwargs)[0] - return OVModelWrapperResult(torch.from_numpy(logits)) - - - ov_model_wrapper = OVModelWrapper(compiled_ov_model) - - ov_predictions = batch_text_detection([test_image], ov_model_wrapper, processor) - - visualize_prediction(test_image, ov_predictions[0]) - - -.. parsed-literal:: - - Detecting bboxes: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00, 1.04s/it] - - - -.. image:: surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_13_1.png - - -Apply post-training quantization using NNCF -------------------------------------------- - - - -`NNCF `__ enables -post-training quantization by adding the quantization layers into the -model graph and then using a subset of the training dataset to -initialize the parameters of these additional quantization layers. The -framework is designed so that modifications to your original training -code are minor. Quantization is the simplest scenario and requires a few -modifications. - -The optimization process contains the following steps: - -1. Create a dataset for quantization. -2. Run ``nncf.quantize`` for getting a quantized model. - -Please select below whether you would like to run quantization to -improve model inference speed. - - **NOTE**: Quantization is time and memory consuming operation. - Running quantization code below may take a long time. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - import requests - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -Free resources before quantization. - -.. code:: ipython3 - - import gc - - del model - del ov_model - del compiled_ov_model - del ov_model_wrapper - - gc.collect(); - -Prepare dataset -~~~~~~~~~~~~~~~ - - - -We create calibration dataset with randomly sampled set of images from -`DocLayNet `__. - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - from surya.input.processing import split_image - - - def prepare_calibration_dataset(size=1, buffer_size=1): - - def collate_fn(data): - image = data[0]["image"].convert("RGB") - image_splits, _ = split_image(image, processor) - image_splits = prepare_image(image_splits[0], processor) - - return image_splits - - dataset = load_dataset("vikp/doclaynet_bench", split="train", streaming=True) - train_dataset = dataset.shuffle(seed=42, buffer_size=buffer_size) - dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1) - - def prepare_calibration_data(dataloader, size): - data = [] - counter = 0 - for batch in dataloader: - if counter == size: - break - counter += 1 - batch = batch.to(torch.float32) - batch = batch.to("cpu") - data.append({"pixel_values": torch.stack([batch])}) - return data - - return prepare_calibration_data(dataloader, size) - - - calibration_dataset = prepare_calibration_dataset() - -Quantize model -~~~~~~~~~~~~~~ - - - -Create a quantized model from the ``FP16`` model. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - - quantized_ov_model = nncf.quantize( - model=core.read_model(FP_MODEL_PATH), - calibration_dataset=nncf.Dataset(calibration_dataset), - advanced_parameters=nncf.AdvancedQuantizationParameters( - activations_quantization_params=nncf.quantization.advanced_parameters.QuantizationParameters(per_channel=False) - ), - ) - - ov.save_model(quantized_ov_model, INT8_MODEL_PATH) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino - - - -.. parsed-literal:: - - Output() - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - -Run quantized OpenVINO model ----------------------------- - - - -Now we ready to detect lines with ``int8`` OpenVINO model. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - # Compile OpenVINO model for loading on device - compiled_int8_ov_model = core.compile_model(quantized_ov_model, device.value) - - int8_ov_model_wrapper = OVModelWrapper(compiled_int8_ov_model) - - int8_ov_predictions = batch_text_detection([test_image], int8_ov_model_wrapper, processor) - - visualize_prediction(test_image, int8_ov_predictions[0]) - - -.. parsed-literal:: - - Detecting bboxes: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.10it/s] - - - -.. image:: surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_24_1.png - - -Interactive inference ---------------------- - - - -Now, it is your turn! Feel free to upload an image, using the file -upload window. - -Below you can select which model to run: original or quantized. - -.. code:: ipython3 - - from pathlib import Path - import ipywidgets as widgets - - quantized_model_present = Path(INT8_MODEL_PATH).exists() - - use_quantized_model = widgets.Checkbox( - value=True if quantized_model_present else False, - description="Use quantized model", - disabled=not quantized_model_present, - ) - - use_quantized_model - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Use quantized model') - - - -.. 
code:: ipython3 - - import gradio as gr - - compiled_model = ov.compile_model(INT8_MODEL_PATH if use_quantized_model.value else FP_MODEL_PATH, device.value) - - - def predict(image): - predictions = batch_text_detection([image], OVModelWrapper(compiled_model), processor) - - image = image.copy() - draw = ImageDraw.Draw(image) - - for polygon_box in predictions[0].bboxes: - draw.rectangle(polygon_box.bbox, width=1, outline="red") - - return image - - - demo = gr.Interface( - predict, - gr.Image(label="Image", type="pil", format="pil"), - gr.Image(label="Result"), - examples=[test_image], - ) - try: - demo.launch(debug=True, height=1000) - except Exception: - demo.launch(share=True, debug=True, height=1000) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ diff --git a/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_13_1.jpg b/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_13_1.jpg deleted file mode 100644 index 8c996f359a5f67..00000000000000 --- a/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_13_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f4230aa8bbf269d52e2364d88a770afeb4610d3be30b31ea5ccb274e5b954ab -size 298986 diff --git a/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_13_1.png b/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_13_1.png deleted file mode 100644 index 0140f259d9ccba..00000000000000 --- a/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_13_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e420dfa7ed3606369bd76745136f89325b23ff6e946884fdb8fd7b7b1169133 -size 633348 diff --git a/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_24_1.jpg b/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_24_1.jpg deleted file mode 100644 index 5d15347b092273..00000000000000 --- a/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_24_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2ea09bcaf8c6a5b53ffb2b7fad7a792b796b64ab1b6f9fe4974babb0892806f3 -size 299421 diff --git a/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_24_1.png b/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_24_1.png deleted file mode 100644 index 92df063fe3d589..00000000000000 --- a/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_24_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c7d0385314c6cc2c08dd054432e28d07a4e568c3d4bb5991ce64dde2c2492d50 -size 633432 diff --git a/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_3_0.jpg 
b/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_3_0.jpg
deleted file mode 100644
index 533b9014c3d773..00000000000000
--- a/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_3_0.jpg
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e2c367b91ed537bafcb82662f7e926d785a732ee70f4e72b970692a63f2737bc
-size 226877
diff --git a/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_3_0.png b/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_3_0.png
deleted file mode 100644
index 57c1df57774b5f..00000000000000
--- a/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_3_0.png
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3c1119947fab21f597cfac3c068084eaf0feecad075cf6381d1db23c030551d4
-size 634906
diff --git a/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_6_6.jpg b/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_6_6.jpg
deleted file mode 100644
index 8c996f359a5f67..00000000000000
--- a/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_6_6.jpg
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6f4230aa8bbf269d52e2364d88a770afeb4610d3be30b31ea5ccb274e5b954ab
-size 298986
diff --git a/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_6_6.png b/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_6_6.png
deleted file mode 100644
index 0140f259d9ccba..00000000000000
--- a/docs/notebooks/surya-line-level-text-detection-with-output_files/surya-line-level-text-detection-with-output_6_6.png
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9e420dfa7ed3606369bd76745136f89325b23ff6e946884fdb8fd7b7b1169133
-size 633348
diff --git a/docs/notebooks/table-question-answering-with-output.rst b/docs/notebooks/table-question-answering-with-output.rst
deleted file mode 100644
index 3dc5e651aca44b..00000000000000
--- a/docs/notebooks/table-question-answering-with-output.rst
+++ /dev/null
@@ -1,363 +0,0 @@
-Table Question Answering using TAPAS and OpenVINO™
-==================================================
-
-Table Question Answering (Table QA) is the task of answering a question
-about the information in a given table. You can use Table Question
-Answering models to simulate SQL execution by inputting a table.
-
-In this tutorial we demonstrate how to perform table question answering
-using OpenVINO. This example is based on the `TAPAS base model fine-tuned
-on WikiTable Questions
-(WTQ) `__ that
-is based on the paper `TAPAS: Weakly Supervised Table Parsing via
-Pre-training `__.
-
-Answering natural language questions over tables is usually seen as a
-semantic parsing task. To alleviate the collection cost of full logical
-forms, one popular approach focuses on weak supervision consisting of
-denotations instead of logical forms. However, training semantic parsers
However, training semantic parsers -from weak supervision poses difficulties, and in addition, the generated -logical forms are only used as an intermediate step prior to retrieving -the denotation. In `this -paper `__, it is presented TAPAS, -an approach to question answering over tables without generating logical -forms. TAPAS trains from weak supervision, and predicts the denotation -by selecting table cells and optionally applying a corresponding -aggregation operator to such selection. TAPAS extends BERT’s -architecture to encode tables as input, initializes from an effective -joint pre-training of text segments and tables crawled from Wikipedia, -and is trained end-to-end. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Use the original model to run an - inference <#use-the-original-model-to-run-an-inference>`__ -- `Convert the original model to OpenVINO Intermediate Representation - (IR) - format <#convert-the-original-model-to-openvino-intermediate-representation-ir-format>`__ -- `Run the OpenVINO model <#run-the-openvino-model>`__ -- `Interactive inference <#interactive-inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - - %pip install -q "transformers>=4.31.0" "torch>=2.1" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "openvino>=2023.2.0" "gradio>=4.0.2" - -.. code:: ipython3 - - import torch - from transformers import TapasForQuestionAnswering - from transformers import TapasTokenizer - from transformers import pipeline - import pandas as pd - from pathlib import Path - - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("table-question-answering.ipynb") - -Use ``TapasForQuestionAnswering.from_pretrained`` to download a -pretrained model and ``TapasTokenizer.from_pretrained`` to get a -tokenizer. - -.. code:: ipython3 - - model = TapasForQuestionAnswering.from_pretrained("google/tapas-large-finetuned-wtq") - tokenizer = TapasTokenizer.from_pretrained("google/tapas-large-finetuned-wtq") - - data = { - "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], - "Number of movies": ["87", "53", "69"], - } - table = pd.DataFrame.from_dict(data) - question = "how many movies does Leonardo Di Caprio have?" - table - - - - -.. raw:: html - -
-    <div>
-    <table border="1" class="dataframe">
-      <thead>
-        <tr style="text-align: right;">
-          <th></th>
-          <th>Actors</th>
-          <th>Number of movies</th>
-        </tr>
-      </thead>
-      <tbody>
-        <tr>
-          <th>0</th>
-          <td>Brad Pitt</td>
-          <td>87</td>
-        </tr>
-        <tr>
-          <th>1</th>
-          <td>Leonardo Di Caprio</td>
-          <td>53</td>
-        </tr>
-        <tr>
-          <th>2</th>
-          <td>George Clooney</td>
-          <td>69</td>
-        </tr>
-      </tbody>
-    </table>
-    </div>
- - - -Use the original model to run an inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We use `this -example `__ to -demonstrate how to make an inference. You can use ``pipeline`` from -``transformer`` library for this purpose. - -.. code:: ipython3 - - tqa = pipeline(task="table-question-answering", model=model, tokenizer=tokenizer) - result = tqa(table=table, query=question) - print(f"The answer is {result['cells'][0]}") - - -.. parsed-literal:: - - The answer is 53 - - -You can read more about the inference output structure in `this -documentation `__. - -Convert the original model to OpenVINO Intermediate Representation (IR) format -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The original model is a PyTorch module, that can be converted with -``ov.convert_model`` function directly. We also use ``ov.save_model`` -function to serialize the result of conversion. - -.. code:: ipython3 - - import openvino as ov - from pathlib import Path - - - # Define the input shape - batch_size = 1 - sequence_length = 29 - - # Modify the input shape of the dummy_input dictionary - dummy_input = { - "input_ids": torch.zeros((batch_size, sequence_length), dtype=torch.long), - "attention_mask": torch.zeros((batch_size, sequence_length), dtype=torch.long), - "token_type_ids": torch.zeros((batch_size, sequence_length, 7), dtype=torch.long), - } - - - ov_model_xml_path = Path("models/ov_model.xml") - - if not ov_model_xml_path.exists(): - ov_model = ov.convert_model(model, example_input=dummy_input) - ov.save_model(ov_model, ov_model_xml_path) - -Run the OpenVINO model -~~~~~~~~~~~~~~~~~~~~~~ - - - -Select a device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -We use ``ov.compile_model`` to make it ready to use for loading on a -device. To prepare inputs use the original ``tokenizer``. - -.. code:: ipython3 - - core = ov.Core() - - inputs = tokenizer(table=table, queries=question, padding="max_length", return_tensors="pt") - - compiled_model = core.compile_model(ov_model_xml_path, device.value) - result = compiled_model((inputs["input_ids"], inputs["attention_mask"], inputs["token_type_ids"])) - -Now we should postprocess results. For this, we can use the appropriate -part of the code from -`postprocess `__ -method of ``TableQuestionAnsweringPipeline``. - -.. code:: ipython3 - - logits = result[0] - logits_aggregation = result[1] - - - predictions = tokenizer.convert_logits_to_predictions(inputs, torch.from_numpy(result[0])) - answer_coordinates_batch = predictions[0] - aggregators = {} - aggregators_prefix = {} - answers = [] - for index, coordinates in enumerate(answer_coordinates_batch): - cells = [table.iat[coordinate] for coordinate in coordinates] - aggregator = aggregators.get(index, "") - aggregator_prefix = aggregators_prefix.get(index, "") - answer = { - "answer": aggregator_prefix + ", ".join(cells), - "coordinates": coordinates, - "cells": [table.iat[coordinate] for coordinate in coordinates], - } - if aggregator: - answer["aggregator"] = aggregator - - answers.append(answer) - - print(answers[0]["cells"][0]) - - -.. parsed-literal:: - - 53 - - -Also, we can use the original pipeline. 
For this, we should create a -wrapper for ``TapasForQuestionAnswering`` class replacing ``forward`` -method to use the OpenVINO model for inference and methods and -attributes of original model class to be integrated into the pipeline. - -.. code:: ipython3 - - from transformers import TapasConfig - - - # get config for pretrained model - config = TapasConfig.from_pretrained("google/tapas-large-finetuned-wtq") - - - class TapasForQuestionAnswering(TapasForQuestionAnswering): # it is better to keep the class name to avoid warnings - def __init__(self, ov_model_path): - super().__init__(config) # pass config from the pretrained model - self.tqa_model = core.compile_model(ov_model_path, device.value) - - def forward(self, input_ids, *, attention_mask, token_type_ids): - results = self.tqa_model((input_ids, attention_mask, token_type_ids)) - - return torch.from_numpy(results[0]), torch.from_numpy(results[1]) - - - compiled_model = TapasForQuestionAnswering(ov_model_xml_path) - tqa = pipeline(task="table-question-answering", model=compiled_model, tokenizer=tokenizer) - print(tqa(table=table, query=question)["cells"][0]) - - -.. parsed-literal:: - - 53 - - -Interactive inference -~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - import pandas as pd - - - def highlight_answers(x, coordinates): - highlighted_table = pd.DataFrame("", index=x.index, columns=x.columns) - for coordinates_i in coordinates: - highlighted_table.iloc[coordinates_i[0], coordinates_i[1]] = "background-color: lightgreen" - - return highlighted_table - - - def infer(query, csv_file_name): - table = pd.read_csv(csv_file_name.name, delimiter=",") - table = table.astype(str) - - result = tqa(table=table, query=query) - table = table.style.apply(highlight_answers, axis=None, coordinates=result["coordinates"]) - - return result["answer"], table - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/table-question-answering/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=infer) - - try: - demo.queue().launch(debug=True) - except Exception: - demo.queue().launch(share=True, debug=True) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ diff --git a/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output.rst b/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output.rst deleted file mode 100644 index dca4264cf8b0d4..00000000000000 --- a/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output.rst +++ /dev/null @@ -1,578 +0,0 @@ -Big Transfer Image Classification Model Quantization pipeline with NNCF -======================================================================= - -This tutorial demonstrates the Quantization of the Big Transfer Image -Classification model, which is fine-tuned on the sub-set of ImageNet -dataset with 10 class labels with -`NNCF `__. It uses -`BiT-M-R50x1/1 `__ -model, which is trained on ImageNet-21k. Big Transfer is a recipe for -pre-training image classification models on large supervised datasets -and efficiently fine-tuning them on any given target task. 
The recipe -achieves excellent performance on a wide variety of tasks, even when -using very few labeled examples from the target dataset. This tutorial -uses OpenVINO backend for performing model quantization in NNCF. - - -**Table of contents:** - - -- `Prepare Dataset <#prepare-dataset>`__ -- `Plotting data samples <#plotting-data-samples>`__ -- `Model Fine-tuning <#model-fine-tuning>`__ -- `Perform model optimization (IR) - step <#perform-model-optimization-ir-step>`__ -- `Compute accuracy of the TF - model <#compute-accuracy-of-the-tf-model>`__ -- `Compute accuracy of the OpenVINO - model <#compute-accuracy-of-the-openvino-model>`__ -- `Quantize OpenVINO model using - NNCF <#quantize-openvino-model-using-nncf>`__ -- `Compute accuracy of the quantized - model <#compute-accuracy-of-the-quantized-model>`__ -- `Compare FP32 and INT8 accuracy <#compare-fp32-and-int8-accuracy>`__ -- `Compare inference results on one - picture <#compare-inference-results-on-one-picture>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - import platform - - %pip install -q "tensorflow-macos>=2.5; sys_platform == 'darwin' and platform_machine == 'arm64' and python_version > '3.8'" # macOS M1 and M2 - %pip install -q "tensorflow>=2.5; sys_platform == 'darwin' and platform_machine != 'arm64' and python_version > '3.8'" # macOS x86 - %pip install -q "tensorflow>=2.5; sys_platform != 'darwin' and python_version > '3.8'" - %pip install -q --no-deps "tensorflow-hub>=0.15.0" - %pip install -q "openvino>=2024.0.0" "nncf>=2.7.0" tf_keras - %pip install -q "scikit-learn>=1.3.2" - - if platform.system() != "Windows": - %pip install -q "matplotlib>=3.4" "tensorflow_datasets>=4.9.0" - else: - %pip install -q "matplotlib>=3.4" "tensorflow_datasets>=4.9.0,<4.9.3" - -.. code:: ipython3 - - import os - import numpy as np - from pathlib import Path - import openvino as ov - import nncf - import logging - - from nncf.common.logging.logger import set_log_level - - set_log_level(logging.ERROR) - - from sklearn.metrics import accuracy_score - - os.environ["TF_USE_LEGACY_KERAS"] = "1" - os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" - os.environ["TFHUB_CACHE_DIR"] = str(Path("./tfhub_modules").resolve()) - - import tensorflow as tf - import tensorflow_datasets as tfds - import tensorflow_hub as hub - - tfds.core.utils.gcs_utils._is_gcs_disabled = True - os.environ["NO_GCE_CHECK"] = "true" - - import requests - - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("tensorflow-bit-image-classification-nncf-quantization.ipynb") - -.. code:: ipython3 - - core = ov.Core() - tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) - - - # For top 5 labels. 
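-    # MAX_PREDS sets how many of the top-scoring labels are kept per image when computing accuracy (1 = top-1).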
- MAX_PREDS = 1 - TRAINING_BATCH_SIZE = 128 - BATCH_SIZE = 1 - IMG_SIZE = (256, 256) # Default Imagenet image size - NUM_CLASSES = 10 # For Imagenette dataset - FINE_TUNING_STEPS = 1 - LR = 1e-5 - - MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255) # From Imagenet dataset - STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255) # From Imagenet dataset - -Prepare Dataset -~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - datasets, datasets_info = tfds.load( - "imagenette/160px", - shuffle_files=True, - as_supervised=True, - with_info=True, - read_config=tfds.ReadConfig(shuffle_seed=0), - ) - train_ds, validation_ds = datasets["train"], datasets["validation"] - -.. code:: ipython3 - - def preprocessing(image, label): - image = tf.image.resize(image, IMG_SIZE) - image = tf.cast(image, tf.float32) / 255.0 - label = tf.one_hot(label, NUM_CLASSES) - return image, label - - - train_dataset = train_ds.map(preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(TRAINING_BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE) - validation_dataset = ( - validation_ds.map(preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(TRAINING_BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE) - ) - -.. code:: ipython3 - - # Class labels dictionary with imagenette sample names and classes - lbl_dict = dict( - n01440764="tench", - n02102040="English springer", - n02979186="cassette player", - n03000684="chain saw", - n03028079="church", - n03394916="French horn", - n03417042="garbage truck", - n03425413="gas pump", - n03445777="golf ball", - n03888257="parachute", - ) - - # Imagenette samples name index - class_idx_dict = [ - "n01440764", - "n02102040", - "n02979186", - "n03000684", - "n03028079", - "n03394916", - "n03417042", - "n03425413", - "n03445777", - "n03888257", - ] - - - def label_func(key): - return lbl_dict[key] - -Plotting data samples -~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - import matplotlib.pyplot as plt - - # Get the class labels from the dataset info - class_labels = datasets_info.features["label"].names - - # Display labels along with the examples - num_examples_to_display = 4 - fig, axes = plt.subplots(nrows=1, ncols=num_examples_to_display, figsize=(10, 5)) - - for i, (image, label_index) in enumerate(train_ds.take(num_examples_to_display)): - label_name = class_labels[label_index.numpy()] - - axes[i].imshow(image.numpy()) - axes[i].set_title(f"{label_func(label_name)}") - axes[i].axis("off") - plt.tight_layout() - plt.show() - - -.. parsed-literal:: - - 2024-01-26 10:40:54.747316: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead. - - - -.. image:: tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_9_1.png - - -.. 
code:: ipython3 - - # Get the class labels from the dataset info - class_labels = datasets_info.features["label"].names - - # Display labels along with the examples - num_examples_to_display = 4 - fig, axes = plt.subplots(nrows=1, ncols=num_examples_to_display, figsize=(10, 5)) - - for i, (image, label_index) in enumerate(validation_ds.take(num_examples_to_display)): - label_name = class_labels[label_index.numpy()] - - axes[i].imshow(image.numpy()) - axes[i].set_title(f"{label_func(label_name)}") - axes[i].axis("off") - plt.tight_layout() - plt.show() - - -.. parsed-literal:: - - 2024-01-26 10:40:57.011386: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead. - - - -.. image:: tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_10_1.png - - -Model Fine-tuning -~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Load the Big Transfer model - tf_model_dir = Path("bit_tf_model") - - if not tf_model_dir.exists(): - bit_model_url = "https://www.kaggle.com/models/google/bit/frameworks/TensorFlow2/variations/m-r50x1/versions/1" - bit_m = hub.KerasLayer(bit_model_url, trainable=True) - # Customize the model for the new task - model = tf.keras.Sequential([bit_m, tf.keras.layers.Dense(NUM_CLASSES, activation="softmax")]) - - # Compile the model - model.compile( - optimizer=tf.keras.optimizers.Adam(learning_rate=LR), - loss="categorical_crossentropy", - metrics=["accuracy"], - ) - - # Fine-tune the model - model.fit( - train_dataset.take(3000), - epochs=FINE_TUNING_STEPS, - validation_data=validation_dataset.take(1000), - ) - model.save(tf_model_dir, save_format="tf") - - -.. parsed-literal:: - - 101/101 [==============================] - 472s 4s/step - loss: 0.4904 - accuracy: 0.8806 - val_loss: 0.0810 - val_accuracy: 0.9840 - - -Perform model optimization (IR) step -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - ir_path = Path("bit_ov_model/bit_m_r50x1_1.xml") - if not ir_path.exists(): - print("Initiating model optimization..!!!") - ov_model = ov.convert_model("./bit_tf_model") - ov.save_model(ov_model, ir_path) - else: - print(f"IR model {ir_path} already exists.") - - -.. parsed-literal:: - - Initiating model optimization..!!! - - -Compute accuracy of the TF model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - tf_model = tf.keras.models.load_model(tf_model_dir) - - tf_predictions = [] - gt_label = [] - - for _, label in validation_dataset: - for cls_label in label: - l_list = cls_label.numpy().tolist() - gt_label.append(l_list.index(1)) - - for img_batch, label_batch in validation_dataset: - tf_result_batch = tf_model.predict(img_batch, verbose=0) - for i in range(len(img_batch)): - tf_result = tf_result_batch[i] - tf_result = tf.reshape(tf_result, [-1]) - top5_label_idx = np.argsort(tf_result)[-MAX_PREDS::][::-1] - tf_predictions.append(top5_label_idx) - - # Convert the lists to NumPy arrays for accuracy calculation - tf_predictions = np.array(tf_predictions) - gt_label = np.array(gt_label) - - tf_acc_score = accuracy_score(tf_predictions, gt_label) - - -.. 
parsed-literal:: - - 2024-01-26 10:51:24.539777: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 're_lu_48/PartitionedCall' has 1 outputs but the _output_shapes attribute specifies shapes for 2 outputs. Output shapes may be inaccurate. - 2024-01-26 10:51:24.539856: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'global_average_pooling2d/PartitionedCall' has 1 outputs but the _output_shapes attribute specifies shapes for 3 outputs. Output shapes may be inaccurate. - - -Compute accuracy of the OpenVINO model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Select device for inference: - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - -.. code:: ipython3 - - core = ov.Core() - - ov_fp32_model = core.read_model(ir_path) - ov_fp32_model.reshape([1, IMG_SIZE[0], IMG_SIZE[1], 3]) - - # Target device set to CPU (Other options Ex: AUTO/GPU/dGPU/) - compiled_model = ov.compile_model(ov_fp32_model, device.value) - output = compiled_model.outputs[0] - - ov_predictions = [] - for img_batch, _ in validation_dataset: - for image in img_batch: - image = tf.expand_dims(image, axis=0) - pred = compiled_model(image)[output] - ov_result = tf.reshape(pred, [-1]) - top_label_idx = np.argsort(ov_result)[-MAX_PREDS::][::-1] - ov_predictions.append(top_label_idx) - - fp32_acc_score = accuracy_score(ov_predictions, gt_label) - -Quantize OpenVINO model using NNCF -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Model Quantization using NNCF - -1. Preprocessing and preparing validation samples for NNCF calibration -2. Perform NNCF Quantization on OpenVINO FP32 model -3. Serialize Quantized OpenVINO INT8 model - -.. code:: ipython3 - - def nncf_preprocessing(image, label): - image = tf.image.resize(image, IMG_SIZE) - image = image - MEAN_RGB - image = image / STDDEV_RGB - return image - - - int8_ir_path = Path("bit_ov_int8_model/bit_m_r50x1_1_ov_int8.xml") - val_ds = validation_ds.map(nncf_preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(1).prefetch(tf.data.experimental.AUTOTUNE) - - calibration_dataset = nncf.Dataset(val_ds) - - ov_fp32_model = core.read_model(ir_path) - - ov_int8_model = nncf.quantize(ov_fp32_model, calibration_dataset, fast_bias_correction=False) - - ov.save_model(ov_int8_model, int8_ir_path) - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -Compute accuracy of the quantized model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - nncf_quantized_model = core.read_model(int8_ir_path) - nncf_quantized_model.reshape([1, IMG_SIZE[0], IMG_SIZE[1], 3]) - - # Target device set to CPU by default - compiled_model = ov.compile_model(nncf_quantized_model, device.value) - output = compiled_model.outputs[0] - - ov_predictions = [] - inp_tensor = nncf_quantized_model.inputs[0] - out_tensor = nncf_quantized_model.outputs[0] - - for img_batch, _ in validation_dataset: - for image in img_batch: - image = tf.expand_dims(image, axis=0) - pred = compiled_model(image)[output] - ov_result = tf.reshape(pred, [-1]) - top_label_idx = np.argsort(ov_result)[-MAX_PREDS::][::-1] - ov_predictions.append(top_label_idx) - - int8_acc_score = accuracy_score(ov_predictions, gt_label) - -Compare FP32 and INT8 accuracy -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - print(f"Accuracy of the tensorflow model (fp32): {tf_acc_score * 100: .2f}%") - print(f"Accuracy of the OpenVINO optimized model (fp32): {fp32_acc_score * 100: .2f}%") - print(f"Accuracy of the OpenVINO quantized model (int8): {int8_acc_score * 100: .2f}%") - accuracy_drop = fp32_acc_score - int8_acc_score - print(f"Accuracy drop between OV FP32 and INT8 model: {accuracy_drop * 100:.1f}% ") - - -.. parsed-literal:: - - Accuracy of the tensorflow model (fp32): 98.40% - Accuracy of the OpenVINO optimized model (fp32): 98.40% - Accuracy of the OpenVINO quantized model (int8): 98.00% - Accuracy drop between OV FP32 and INT8 model: 0.4% - - -Compare inference results on one picture -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Accessing validation sample - sample_idx = 50 - vds = datasets["validation"] - - if len(vds) > sample_idx: - sample = vds.take(sample_idx + 1).skip(sample_idx).as_numpy_iterator().next() - else: - print("Dataset does not have enough samples...!!!") - - # Image data - sample_data = sample[0] - - # Label info - sample_label = sample[1] - - # Image data pre-processing - image = tf.image.resize(sample_data, IMG_SIZE) - image = tf.expand_dims(image, axis=0) - image = tf.cast(image, tf.float32) / 255.0 - - - # OpenVINO inference - def ov_inference(model: ov.Model, image) -> str: - compiled_model = ov.compile_model(model, device.value) - output = compiled_model.outputs[0] - pred = compiled_model(image)[output] - ov_result = tf.reshape(pred, [-1]) - pred_label = np.argsort(ov_result)[-MAX_PREDS::][::-1] - return pred_label - - - # OpenVINO FP32 model - ov_fp32_model = core.read_model(ir_path) - ov_fp32_model.reshape([1, IMG_SIZE[0], IMG_SIZE[1], 3]) - - # OpenVINO INT8 model - ov_int8_model = core.read_model(int8_ir_path) - ov_int8_model.reshape([1, IMG_SIZE[0], IMG_SIZE[1], 3]) - - # OpenVINO FP32 model inference - ov_fp32_pred_label = ov_inference(ov_fp32_model, image) - - print(f"Predicted label for the sample picture by float (fp32) model: {label_func(class_idx_dict[int(ov_fp32_pred_label)])}\n") - - # OpenVINO FP32 model inference - ov_int8_pred_label = ov_inference(ov_int8_model, image) - print(f"Predicted label for the sample picture by qunatized (int8) model: {label_func(class_idx_dict[int(ov_int8_pred_label)])}\n") - - # Plotting the image sample with ground truth - plt.figure() - plt.imshow(sample_data) - plt.title(f"Ground truth: {label_func(class_idx_dict[sample_label])}") - plt.axis("off") - plt.show() - - -.. parsed-literal:: - - Predicted label for the sample picture by float (fp32) model: gas pump - - Predicted label for the sample picture by qunatized (int8) model: gas pump - - - - -.. 
image:: tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_27_1.png - diff --git a/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_10_1.png b/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_10_1.png deleted file mode 100644 index 71aa7443a92cd8..00000000000000 --- a/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_10_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c7b53b19fd375df2b53791482fa4f76ec9d376be865f1298f4ea5aa0acdb1f35 -size 224517 diff --git a/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_27_1.png b/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_27_1.png deleted file mode 100644 index 38f050c05e472a..00000000000000 --- a/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_27_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:048e8ff7b7ac7fa5f9cb66251d618f1ae941f26255f62c725d6223abd63e6fb7 -size 335047 diff --git a/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_9_1.png b/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_9_1.png deleted file mode 100644 index a8d02fcbd58c16..00000000000000 --- a/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_9_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3bf1b651f79891da47103dcc27259f890451c392325a712ff4c1b1cace7cb4be -size 296205 diff --git a/docs/notebooks/tensorflow-classification-to-openvino-with-output.rst b/docs/notebooks/tensorflow-classification-to-openvino-with-output.rst deleted file mode 100644 index dd935990109248..00000000000000 --- a/docs/notebooks/tensorflow-classification-to-openvino-with-output.rst +++ /dev/null @@ -1,313 +0,0 @@ -Convert a TensorFlow Model to OpenVINO™ -======================================= - -This short tutorial shows how to convert a TensorFlow -`MobileNetV3 `__ -image classification model to OpenVINO `Intermediate -Representation `__ -(OpenVINO IR) format, using `Model Conversion -API `__. -After creating the OpenVINO IR, load the model in `OpenVINO -Runtime `__ -and do inference with a sample image. 
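-
-At a glance, the whole flow boils down to a few OpenVINO Python API calls.
-The snippet below is only a compact sketch of what the following sections do
-step by step; it assumes the SavedModel directory and IR path created later
-in this tutorial and uses a dummy input tensor instead of a real image.
-
-.. code:: ipython3
-
-    import numpy as np
-    import openvino as ov
-
-    # Convert the TensorFlow SavedModel to an OpenVINO model with a static input shape.
-    ov_model = ov.convert_model("model/v3-small_224_1.0_float", input=[[1, 224, 224, 3]])
-
-    # Serialize the converted model to OpenVINO IR (.xml + .bin) on disk.
-    ov.save_model(ov_model, "model/v3-small_224_1.0_float.xml")
-
-    # Load the IR on a device and run inference on a dummy NHWC tensor.
-    compiled_model = ov.Core().compile_model("model/v3-small_224_1.0_float.xml", "CPU")
-    result = compiled_model(np.zeros((1, 224, 224, 3), dtype=np.float32))[compiled_model.output(0)]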
- - -**Table of contents:** - - -- `Imports <#imports>`__ -- `Settings <#settings>`__ -- `Download model <#download-model>`__ -- `Convert a Model to OpenVINO IR - Format <#convert-a-model-to-openvino-ir-format>`__ - - - `Convert a TensorFlow Model to OpenVINO IR - Format <#convert-a-tensorflow-model-to-openvino-ir-format>`__ - -- `Test Inference on the Converted - Model <#test-inference-on-the-converted-model>`__ - - - `Load the Model <#load-the-model>`__ - -- `Select inference device <#select-inference-device>`__ - - - `Get Model Information <#get-model-information>`__ - - `Load an Image <#load-an-image>`__ - - `Do Inference <#do-inference>`__ - -- `Timing <#timing>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. code:: ipython3 - - # Install openvino package - %pip install -q "openvino>=2023.1.0" "opencv-python" - %pip install -q "matplotlib>=3.4" - %pip install -q "tensorflow-macos>=2.5; sys_platform == 'darwin' and platform_machine == 'arm64' and python_version > '3.8'" # macOS M1 and M2 - %pip install -q "tensorflow>=2.5; sys_platform == 'darwin' and platform_machine != 'arm64' and python_version > '3.8'" # macOS x86 - %pip install -q "tensorflow>=2.5; sys_platform != 'darwin' and python_version > '3.8'" - %pip install -q --no-deps tensorflow_hub - %pip install -q tf_keras tqdm - -Imports -------- - - - -.. code:: ipython3 - - import os - import time - from pathlib import Path - - os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" - os.environ["TF_USE_LEGACY_KERAS"] = "1" - - import cv2 - import matplotlib.pyplot as plt - import numpy as np - import openvino as ov - import tensorflow as tf - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("tensorflow-classification-to-openvino.ipynb") - -Settings --------- - - - -.. code:: ipython3 - - # The paths of the source and converted models. - model_dir = Path("model") - model_dir.mkdir(exist_ok=True) - - model_path = Path("model/v3-small_224_1.0_float") - - ir_path = Path("model/v3-small_224_1.0_float.xml") - -Download model --------------- - - - -Load model using `tf.keras.applications -api `__ -and save it to the disk. - -.. code:: ipython3 - - model = tf.keras.applications.MobileNetV3Small() - model.save(model_path) - -Convert a Model to OpenVINO IR Format -------------------------------------- - - - -Convert a TensorFlow Model to OpenVINO IR Format -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Use the model conversion Python API to convert the TensorFlow model to -OpenVINO IR. The ``ov.convert_model`` function accept path to saved -model directory and returns OpenVINO Model class instance which -represents this model. Obtained model is ready to use and to be loaded -on a device using ``ov.compile_model`` or can be saved on a disk using -the ``ov.save_model`` function. 
See the -`tutorial `__ -for more information about using model conversion API with TensorFlow -models. - -.. code:: ipython3 - - # Run model conversion API if the IR model file does not exist - if not ir_path.exists(): - print("Exporting TensorFlow model to IR... This may take a few minutes.") - ov_model = ov.convert_model(model_path, input=[[1, 224, 224, 3]]) - ov.save_model(ov_model, ir_path) - else: - print(f"IR model {ir_path} already exists.") - - -.. parsed-literal:: - - Exporting TensorFlow model to IR... This may take a few minutes. - - -Test Inference on the Converted Model -------------------------------------- - - - -Load the Model -~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - core = ov.Core() - model = core.read_model(ir_path) - -Select inference device ------------------------ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - compiled_model = core.compile_model(model=model, device_name=device.value) - -Get Model Information -~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - input_key = compiled_model.input(0) - output_key = compiled_model.output(0) - network_input_shape = input_key.shape - -Load an Image -~~~~~~~~~~~~~ - - - -Load an image, resize it, and convert it to the input shape of the -network. - -.. code:: ipython3 - - # Download the image from the openvino_notebooks storage - image_filename = Path("data/coco.jpg") - - if not image_filename.exists(): - download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco.jpg", - directory="data", - ) - - # The MobileNet network expects images in RGB format. - image = cv2.cvtColor(cv2.imread(filename=str(image_filename)), code=cv2.COLOR_BGR2RGB) - - # Resize the image to the network input shape. - resized_image = cv2.resize(src=image, dsize=(224, 224)) - - # Transpose the image to the network input shape. - input_image = np.expand_dims(resized_image, 0) - - plt.imshow(image); - - - -.. image:: tensorflow-classification-to-openvino-with-output_files/tensorflow-classification-to-openvino-with-output_19_0.png - - -Do Inference -~~~~~~~~~~~~ - - - -.. code:: ipython3 - - result = compiled_model(input_image)[output_key] - - result_index = np.argmax(result) - -.. code:: ipython3 - - # Download the datasets from the openvino_notebooks storage - labels_filename = Path("data/imagenet_2012.txt") - if not labels_filename.exists(): - download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/datasets/imagenet/imagenet_2012.txt", - directory="data", - ) - - # Convert the inference result to a class name. - imagenet_classes = labels_filename.read_text().splitlines() - - imagenet_classes[result_index] - - - - -.. parsed-literal:: - - 'n02099267 flat-coated retriever' - - - -Timing ------- - - - -Measure the time it takes to do inference on thousand images. This gives -an indication of performance. For more accurate benchmarking, use the -`Benchmark -Tool `__ -in OpenVINO. Note that many optimizations are possible to improve the -performance. - -.. 
code:: ipython3 - - num_images = 1000 - - start = time.perf_counter() - - for _ in range(num_images): - compiled_model([input_image]) - - end = time.perf_counter() - time_ir = end - start - - print(f"IR model in OpenVINO Runtime/CPU: {time_ir/num_images:.4f} " f"seconds per image, FPS: {num_images/time_ir:.2f}") - - -.. parsed-literal:: - - IR model in OpenVINO Runtime/CPU: 0.0011 seconds per image, FPS: 924.82 - diff --git a/docs/notebooks/tensorflow-classification-to-openvino-with-output_files/tensorflow-classification-to-openvino-with-output_19_0.png b/docs/notebooks/tensorflow-classification-to-openvino-with-output_files/tensorflow-classification-to-openvino-with-output_19_0.png deleted file mode 100644 index 15c5ba574c1614..00000000000000 --- a/docs/notebooks/tensorflow-classification-to-openvino-with-output_files/tensorflow-classification-to-openvino-with-output_19_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7511b8a4e5b047600d5fed14fbc7e9653a868bc5253abf1e0c3ef649b47bc408 -size 387941 diff --git a/docs/notebooks/tensorflow-hub-with-output.rst b/docs/notebooks/tensorflow-hub-with-output.rst deleted file mode 100644 index a958f94b7e81ae..00000000000000 --- a/docs/notebooks/tensorflow-hub-with-output.rst +++ /dev/null @@ -1,491 +0,0 @@ -Convert of TensorFlow Hub models to OpenVINO Intermediate Representation (IR) -============================================================================= - -|Colab| |Binder| - -This tutorial demonstrates step-by-step instructions on how to convert -models loaded from TensorFlow Hub using OpenVINO Runtime. - -`TensorFlow Hub `__ is a library and online platform -developed by Google that simplifies machine learning model reuse and -sharing. It serves as a repository of pre-trained models, embeddings, -and reusable components, allowing researchers and developers to access -and integrate state-of-the-art machine learning models into their own -projects with ease. TensorFlow Hub provides a diverse range of models -for various tasks like image classification, text embedding, and more. -It streamlines the process of incorporating these models into TensorFlow -workflows, fostering collaboration and accelerating the development of -AI applications. This centralized hub enhances model accessibility and -promotes the rapid advancement of machine learning capabilities across -the community. - -You have the flexibility to run this tutorial notebook in its entirety -or selectively execute specific sections, as each section operates -independently. - - -**Table of contents:** - - -- `Install required packages <#install-required-packages>`__ -- `Image classification <#image-classification>`__ - - - `Import libraries <#import-libraries>`__ - - `Download the classifier <#download-the-classifier>`__ - - `Download a single image to try the model - on <#download-a-single-image-to-try-the-model-on>`__ - - `Convert model to OpenVINO IR <#convert-model-to-openvino-ir>`__ - - `Select inference device <#select-inference-device>`__ - - `Inference <#inference>`__ - -- `Image style transfer <#image-style-transfer>`__ - - - `Install required packages <#install-required-packages>`__ - - `Load the model <#load-the-model>`__ - - `Convert the model to OpenVINO - IR <#convert-the-model-to-openvino-ir>`__ - - `Select inference device <#select-inference-device>`__ - - `Inference <#inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. 
- -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. |Colab| image:: https://colab.research.google.com/assets/colab-badge.svg - :target: https://colab.research.google.com/github/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/tensorflow-hub/tensorflow-hub.ipynb -.. |Binder| image:: https://mybinder.org/badge_logo.svg - :target: https://mybinder.org/v2/gh/eaidova/openvino_notebooks_binder.git/main?urlpath=git-pull%3Frepo%3Dhttps%253A%252F%252Fgithub.com%252Fopenvinotoolkit%252Fopenvino_notebooks%26urlpath%3Dtree%252Fopenvino_notebooks%252Fnotebooks%2Ftensorflow-hub%2Ftensorflow-hub.ipynb - -Install required packages -------------------------- - - - -.. code:: ipython3 - - %pip install -q pillow numpy - %pip install -q "openvino>=2023.2.0" "opencv-python" "matplotlib>=3.4" - - %pip install -q "tensorflow-macos>=2.5; sys_platform == 'darwin' and platform_machine == 'arm64'" - %pip install -q "tensorflow>=2.5; sys_platform == 'darwin' and platform_machine != 'arm64'" - %pip install -q "tensorflow>=2.5; sys_platform != 'darwin'" - %pip install -q --no-deps tensorflow_hub - %pip install -q tf_keras - -Image classification --------------------- - - - -We will use the `MobileNet_v2 `__ -image classification model from `TensorFlow Hub `__. - -MobileNetV2 is a compact and efficient deep learning architecture -designed for mobile and embedded devices, developed by Google -researchers. It builds on the success of the original MobileNet by -introducing improvements in both speed and accuracy. MobileNetV2 employs -a streamlined architecture with inverted residual blocks, making it -highly efficient for real-time applications while minimizing -computational resources. This network excels in tasks like image -classification, object detection, and image segmentation, offering a -balance between model size and performance. MobileNetV2 has become a -popular choice for on-device AI applications, enabling faster and more -efficient deep learning inference on smartphones and edge devices. - -More information about model can be found on `Model page on TensorFlow -Hub `__ - -Import libraries -~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from pathlib import Path - import os - import requests - - os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" - os.environ["TF_USE_LEGACY_KERAS"] = "1" - os.environ["TFHUB_CACHE_DIR"] = str(Path("./tfhub_modules").resolve()) - - import tensorflow_hub as hub - import tensorflow as tf - import PIL - import numpy as np - import matplotlib.pyplot as plt - - import openvino as ov - - tf.get_logger().setLevel("ERROR") - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("tensorflow-hub.ipynb") - -.. 
code:: ipython3 - - IMAGE_SHAPE = (224, 224) - IMAGE_URL, IMAGE_PATH = ( - "https://storage.googleapis.com/download.tensorflow.org/example_images/grace_hopper.jpg", - "data/grace_hopper.jpg", - ) - MODEL_URL, MODEL_PATH = ( - "https://www.kaggle.com/models/google/mobilenet-v1/frameworks/tensorFlow2/variations/100-224-classification/versions/2", - "models/mobilenet_v2_100_224.xml", - ) - -Download the classifier -~~~~~~~~~~~~~~~~~~~~~~~ - -Select a MobileNetV2 -pre-trained model `from TensorFlow -Hub `__ -and wrap it as a Keras layer with ``hub.KerasLayer``. - -.. code:: ipython3 - - model = hub.KerasLayer(MODEL_URL, input_shape=IMAGE_SHAPE + (3,)) - -Download a single image to try the model on -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The input ``images`` are -expected to have color values in the range [0,1], following the `common -image input -conventions `__. -For this model, the size of the input images is fixed to ``height`` x -``width`` = 224 x 224 pixels. - -.. code:: ipython3 - - IMAGE_PATH = Path(IMAGE_PATH) - - IMAGE_PATH.parent.mkdir(parents=True, exist_ok=True) - - if not IMAGE_PATH.exists(): - r = requests.get(IMAGE_URL) - with IMAGE_PATH.open("wb") as f: - f.write(r.content) - grace_hopper = PIL.Image.open(IMAGE_PATH).resize(IMAGE_SHAPE) - grace_hopper - - - - -.. image:: tensorflow-hub-with-output_files/tensorflow-hub-with-output_11_0.png - - - -Normalize the image to [0,1] range. - -.. code:: ipython3 - - grace_hopper = np.array(grace_hopper) / 255.0 - grace_hopper.shape - - - - -.. parsed-literal:: - - (224, 224, 3) - - - -Convert model to OpenVINO IR -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We will convert the loaded model to OpenVINO IR using -``ov.convert_model`` function. We pass the model object to it, no -additional arguments required. Then, we save the model to disk using -``ov.save_model`` function. - -.. code:: ipython3 - - MODEL_PATH = Path(MODEL_PATH) - - if not MODEL_PATH.exists(): - converted_model = ov.convert_model(model) - ov.save_model(converted_model, MODEL_PATH) - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - core = ov.Core() - - compiled_model = core.compile_model(MODEL_PATH, device_name=device.value) - -Inference -~~~~~~~~~ - - - -Add a batch dimension (with ``np.newaxis``) and pass the image to the -model: - -.. code:: ipython3 - - output = compiled_model(grace_hopper[np.newaxis, ...])[0] - output.shape - - - - -.. parsed-literal:: - - (1, 1001) - - - -The result is a 1001-element vector of logits, rating the probability of -each class for the image. - -The top class ID can be found with ``np.argmax``: - -.. code:: ipython3 - - predicted_class = np.argmax(output[0], axis=-1) - predicted_class - - - - -.. parsed-literal:: - - 653 - - - -Take the ``predicted_class`` ID (such as ``653``) and fetch the ImageNet -dataset labels to decode the predictions: - -.. 
code:: ipython3 - - labels_path = tf.keras.utils.get_file( - "ImageNetLabels.txt", - "https://storage.googleapis.com/download.tensorflow.org/data/ImageNetLabels.txt", - ) - imagenet_labels = np.array(open(labels_path).read().splitlines()) - plt.imshow(grace_hopper) - plt.axis("off") - predicted_class_name = imagenet_labels[predicted_class] - _ = plt.title("Prediction: " + predicted_class_name.title()) - - - -.. image:: tensorflow-hub-with-output_files/tensorflow-hub-with-output_26_0.png - - -Image style transfer --------------------- - - - -We will use `arbitrary image stylization -model `__ from `TensorFlow -Hub `__. - -The model contains conditional instance normalization (CIN) layers - -The CIN network consists of two main components: a feature extractor and -a stylization module. The feature extractor extracts a set of features -from the content image. The stylization module then uses these features -to generate a stylized image. - -The stylization module is a stack of convolutional layers. Each -convolutional layer is followed by a CIN layer. The CIN layer takes the -features from the previous layer and the CIN parameters from the style -image as input and produces a new set of features as output. - -The output of the stylization module is a stylized image. The stylized -image has the same content as the original content image, but the style -has been transferred from the style image. - -The CIN network is able to stylize images in real time because it is -very efficient. - -More model information can be found on `Model page on TensorFlow -Hub `__. - -.. code:: ipython3 - - import os - - os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" - os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" - os.environ["TF_USE_LEGACY_KERAS"] = "1" - os.environ["TFHUB_CACHE_DIR"] = str(Path("./tfhub_modules").resolve()) - from pathlib import Path - - import openvino as ov - - import tensorflow_hub as hub - import tensorflow as tf - import cv2 - import numpy as np - import matplotlib.pyplot as plt - -.. code:: ipython3 - - CONTENT_IMAGE_URL = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/525babb8-1289-45f8-a3a5-e248f74dfb24" - CONTENT_IMAGE_PATH = Path("./data/YellowLabradorLooking_new.jpg") - - STYLE_IMAGE_URL = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/c212233d-9a33-4979-b8f9-2a94a529026e" - STYLE_IMAGE_PATH = Path("./data/Vassily_Kandinsky%2C_1913_-_Composition_7.jpg") - - MODEL_URL = "https://www.kaggle.com/models/google/arbitrary-image-stylization-v1/frameworks/tensorFlow1/variations/256/versions/2" - MODEL_PATH = Path("./models/arbitrary-image-stylization-v1-256.xml") - -Load the model -~~~~~~~~~~~~~~ - - - -We load the model from TensorFlow Hub using ``hub.KerasLayer``. Since -the model has multiple inputs (content image and style image), we need -to build it by calling with placeholders and wrap in ``tf.keras.Model`` -function. - -.. code:: ipython3 - - inputs = { - "placeholder": tf.keras.layers.Input(shape=(None, None, 3)), - "placeholder_1": tf.keras.layers.Input(shape=(None, None, 3)), - } - model = hub.KerasLayer(MODEL_URL, signature="serving_default", signature_outputs_as_dict=True) # define the signature to allow passing inputs as a dictionary - outputs = model(inputs) - model = tf.keras.Model(inputs=inputs, outputs=outputs) - -Convert the model to OpenVINO IR -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -We convert the loaded model to OpenVINO IR using ``ov.convert_model`` -function. We pass our model to the function, no additional arguments -needed. 
After converting, we save the model to disk using -``ov.save_model`` function. - -.. code:: ipython3 - - if not MODEL_PATH.exists(): - MODEL_PATH.parent.mkdir(parents=True, exist_ok=True) - converted_model = ov.convert_model(model) - ov.save_model(converted_model, MODEL_PATH) - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - compiled_model = core.compile_model(MODEL_PATH, device_name=device.value) - -Inference -~~~~~~~~~ - - - -.. code:: ipython3 - - if not STYLE_IMAGE_PATH.exists(): - r = requests.get(STYLE_IMAGE_URL) - with STYLE_IMAGE_PATH.open("wb") as f: - f.write(r.content) - if not CONTENT_IMAGE_PATH.exists(): - r = requests.get(CONTENT_IMAGE_URL) - with CONTENT_IMAGE_PATH.open("wb") as f: - f.write(r.content) - - - def load_image(dst): - image = cv2.imread(str(dst)) - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Convert image color to RGB space - image = image / 255 # Normalize to [0, 1] interval - image = image.astype(np.float32) - return image - -.. code:: ipython3 - - content_image = load_image(CONTENT_IMAGE_PATH) - style_image = load_image(STYLE_IMAGE_PATH) - style_image = cv2.resize(style_image, (256, 256)) # model was trained on 256x256 images - -.. code:: ipython3 - - result = compiled_model([content_image[np.newaxis, ...], style_image[np.newaxis, ...]])[0] - -.. code:: ipython3 - - title2img = { - "Source image": content_image, - "Reference style": style_image, - "Result": result[0], - } - plt.figure(figsize=(12, 12)) - for i, (title, img) in enumerate(title2img.items()): - ax = plt.subplot(1, 3, i + 1) - ax.set_title(title) - plt.imshow(img) - plt.axis("off") - - - -.. 
image:: tensorflow-hub-with-output_files/tensorflow-hub-with-output_43_0.png - diff --git a/docs/notebooks/tensorflow-hub-with-output_files/tensorflow-hub-with-output_11_0.png b/docs/notebooks/tensorflow-hub-with-output_files/tensorflow-hub-with-output_11_0.png deleted file mode 100644 index f0c32e219fe5b4..00000000000000 --- a/docs/notebooks/tensorflow-hub-with-output_files/tensorflow-hub-with-output_11_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cf7ea6c175c39ff93d61ab187e4569910a96d1fc44012fe4bb05f5ec43c5f516 -size 85029 diff --git a/docs/notebooks/tensorflow-hub-with-output_files/tensorflow-hub-with-output_26_0.png b/docs/notebooks/tensorflow-hub-with-output_files/tensorflow-hub-with-output_26_0.png deleted file mode 100644 index 4f1da92e5284c7..00000000000000 --- a/docs/notebooks/tensorflow-hub-with-output_files/tensorflow-hub-with-output_26_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3050fd069390a5410693d7649a6d666fe41504c7b1543897a83170e6cd8a6a5f -size 203738 diff --git a/docs/notebooks/tensorflow-hub-with-output_files/tensorflow-hub-with-output_43_0.png b/docs/notebooks/tensorflow-hub-with-output_files/tensorflow-hub-with-output_43_0.png deleted file mode 100644 index db16fca26331fa..00000000000000 --- a/docs/notebooks/tensorflow-hub-with-output_files/tensorflow-hub-with-output_43_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0c31f759683e5a545ba98521191e1ac6c83f2ae9853d2293b9832753f9f0e701 -size 538743 diff --git a/docs/notebooks/tensorflow-instance-segmentation-to-openvino-with-output.rst b/docs/notebooks/tensorflow-instance-segmentation-to-openvino-with-output.rst deleted file mode 100644 index 2ca3078e97cef5..00000000000000 --- a/docs/notebooks/tensorflow-instance-segmentation-to-openvino-with-output.rst +++ /dev/null @@ -1,715 +0,0 @@ -Convert a TensorFlow Instance Segmentation Model to OpenVINO™ -============================================================= - -`TensorFlow `__, or TF for short, is an -open-source framework for machine learning. - -The `TensorFlow Object Detection -API `__ -is an open-source computer vision framework built on top of TensorFlow. -It is used for building object detection and instance segmentation -models that can localize multiple objects in the same image. TensorFlow -Object Detection API supports various architectures and models, which -can be found and downloaded from the `TensorFlow -Hub `__. - -This tutorial shows how to convert a TensorFlow `Mask R-CNN with -Inception ResNet -V2 `__ -instance segmentation model to OpenVINO `Intermediate -Representation `__ -(OpenVINO IR) format, using `Model Conversion -API `__. -After creating the OpenVINO IR, load the model in `OpenVINO -Runtime `__ -and do inference with a sample image. 
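At a glance, the flow condenses to a few OpenVINO calls. The snippet
below is only an orientation sketch, not a cell from this notebook:
the ``is-model`` directory and model name are taken from the Settings
section further down, and the random array stands in for a real
photograph. Each step is covered in detail in the sections listed next.

.. code:: ipython3

    import numpy as np
    import openvino as ov

    # Convert the TensorFlow SavedModel directory (downloaded later in this
    # notebook) to OpenVINO IR and keep the IR on disk for future runs.
    ov_model = ov.convert_model("is-model/tf")
    ov.save_model(ov_model, "is-model/ir/mask_rcnn_inception_resnet_v2_1024x1024.xml")

    # Compile the model and run inference on a single [1, height, width, 3] uint8 image.
    core = ov.Core()
    compiled_model = core.compile_model(ov_model, device_name="AUTO")
    image = np.random.randint(0, 255, (1, 640, 480, 3), dtype=np.uint8)  # stand-in for a real photo
    result = compiled_model(image)
    print(result.get("num_detections"), result.get("detection_boxes").shape)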
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Imports <#imports>`__ -- `Settings <#settings>`__ -- `Download Model from TensorFlow - Hub <#download-model-from-tensorflow-hub>`__ -- `Convert Model to OpenVINO IR <#convert-model-to-openvino-ir>`__ -- `Test Inference on the Converted - Model <#test-inference-on-the-converted-model>`__ -- `Select inference device <#select-inference-device>`__ - - - `Load the Model <#load-the-model>`__ - - `Get Model Information <#get-model-information>`__ - - `Get an Image for Test - Inference <#get-an-image-for-test-inference>`__ - - `Perform Inference <#perform-inference>`__ - - `Inference Result - Visualization <#inference-result-visualization>`__ - -- `Next Steps <#next-steps>`__ - - - `Async inference pipeline <#async-inference-pipeline>`__ - - `Integration preprocessing to - model <#integration-preprocessing-to-model>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install required packages: - -.. code:: ipython3 - - %pip install -q "openvino>=2023.1.0" "numpy>=1.21.0" "opencv-python" "tqdm" - - %pip install -q "matplotlib>=3.4" - - %pip install -q "tensorflow-macos>=2.5; sys_platform == 'darwin' and platform_machine == 'arm64'" # macOS M1 and M2 - %pip install -q "tensorflow>=2.5; sys_platform == 'darwin' and platform_machine != 'arm64'" # macOS x86 - %pip install -q "tensorflow>=2.5; sys_platform != 'darwin'" - -The notebook uses utility functions. The cell below will download the -``notebook_utils`` Python module from GitHub. - -.. code:: ipython3 - - # Fetch the notebook utils script from the openvino_notebooks repo - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("tensorflow-instance-segmentation-to-openvino.ipynb") - -Imports -------- - - - -.. code:: ipython3 - - # Standard python modules - from pathlib import Path - - # External modules and dependencies - import cv2 - import matplotlib.pyplot as plt - import numpy as np - - # Notebook utils module - from notebook_utils import download_file, device_widget - - # OpenVINO modules - import openvino as ov - -Settings --------- - - - -Define model related variables and create corresponding directories: - -.. 
code:: ipython3 - - # Create directories for models files - model_dir = Path("is-model") - model_dir.mkdir(exist_ok=True) - - # Create directory for TensorFlow model - tf_model_dir = model_dir / "tf" - tf_model_dir.mkdir(exist_ok=True) - - # Create directory for OpenVINO IR model - ir_model_dir = model_dir / "ir" - ir_model_dir.mkdir(exist_ok=True) - - model_name = "mask_rcnn_inception_resnet_v2_1024x1024" - - openvino_ir_path = ir_model_dir / f"{model_name}.xml" - - tf_model_url = ( - "https://www.kaggle.com/models/tensorflow/mask-rcnn-inception-resnet-v2/frameworks/tensorFlow2/variations/1024x1024/versions/1?tf-hub-format=compressed" - ) - - tf_model_archive_filename = f"{model_name}.tar.gz" - -Download Model from TensorFlow Hub ----------------------------------- - - - -Download archive with TensorFlow Instance Segmentation model -(`mask_rcnn_inception_resnet_v2_1024x1024 `__) -from TensorFlow Hub: - -.. code:: ipython3 - - if not (tf_model_dir / tf_model_archive_filename).exists(): - download_file(url=tf_model_url, filename=tf_model_archive_filename, directory=tf_model_dir); - - - -.. parsed-literal:: - - model/tf/mask_rcnn_inception_resnet_v2_1024x1024.tar.gz: 0%| | 0.00/232M [00:00`__. -Optionally, we can apply compression to FP16 model weights using -``compress_to_fp16=True`` option and integrate preprocessing using this -approach. - -The converted model is ready to load on a device using ``compile_model`` -or saved on disk using the ``serialize`` function to reduce loading time -when the model is run in the future. - -.. code:: ipython3 - - ov_model = ov.convert_model(tf_model_dir) - - # Save converted OpenVINO IR model to the corresponding directory - ov.save_model(ov_model, openvino_ir_path) - -Test Inference on the Converted Model -------------------------------------- - - - -Select inference device ------------------------ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - core = ov.Core() - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -Load the Model -~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - openvino_ir_model = core.read_model(openvino_ir_path) - compiled_model = core.compile_model(model=openvino_ir_model, device_name=device.value) - -Get Model Information -~~~~~~~~~~~~~~~~~~~~~ - - - -Mask R-CNN with Inception ResNet V2 instance segmentation model has one -input - a three-channel image of variable size. The input tensor shape -is ``[1, height, width, 3]`` with values in ``[0, 255]``. - -Model output dictionary contains a lot of tensors, we will use only 5 of -them: - ``num_detections``: A ``tf.int`` tensor with only one value, the -number of detections ``[N]``. - ``detection_boxes``: A ``tf.float32`` -tensor of shape ``[N, 4]`` containing bounding box coordinates in the -following order: ``[ymin, xmin, ymax, xmax]``. - ``detection_classes``: -A ``tf.int`` tensor of shape ``[N]`` containing detection class index -from the label file. - ``detection_scores``: A ``tf.float32`` tensor of -shape ``[N]`` containing detection scores. - ``detection_masks``: A -``[batch, max_detections, mask_height, mask_width]`` tensor. Note that a -pixel-wise sigmoid score converter is applied to the detection masks. - -For more information about model inputs, outputs and their formats, see -the `model overview page on TensorFlow -Hub `__. 
- -It is important to mention, that values of ``detection_boxes``, -``detection_classes``, ``detection_scores``, ``detection_masks`` -correspond to each other and are ordered by the highest detection score: -the first detection mask corresponds to the first detection class and to -the first (and highest) detection score. - -.. code:: ipython3 - - model_inputs = compiled_model.inputs - model_outputs = compiled_model.outputs - - print("Model inputs count:", len(model_inputs)) - print("Model inputs:") - for _input in model_inputs: - print(" ", _input) - - print("Model outputs count:", len(model_outputs)) - print("Model outputs:") - for output in model_outputs: - print(" ", output) - - -.. parsed-literal:: - - Model inputs count: 1 - Model inputs: - - Model outputs count: 23 - Model outputs: - - - - - - - - - - - - - - - - - - - - - - - - - -Get an Image for Test Inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Load and save an image: - -.. code:: ipython3 - - image_path = Path("./data/coco_bike.jpg") - - if not image_path.exists(): - download_file( - url="https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bike.jpg", - filename=image_path.name, - directory=image_path.parent, - ); - - -.. parsed-literal:: - - 'data/coco_bike.jpg' already exists. - - -Read the image, resize and convert it to the input shape of the network: - -.. code:: ipython3 - - # Read the image - image = cv2.imread(filename=str(image_path)) - - # The network expects images in RGB format - image = cv2.cvtColor(image, code=cv2.COLOR_BGR2RGB) - - # Resize the image to the network input shape - resized_image = cv2.resize(src=image, dsize=(255, 255)) - - # Add batch dimension to image - network_input_image = np.expand_dims(resized_image, 0) - - # Show the image - plt.imshow(image) - - - - -.. parsed-literal:: - - - - - - -.. image:: tensorflow-instance-segmentation-to-openvino-with-output_files/tensorflow-instance-segmentation-to-openvino-with-output_25_1.png - - -Perform Inference -~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - inference_result = compiled_model(network_input_image) - -After model inference on the test image, instance segmentation data can -be extracted from the result. For further model result visualization -``detection_boxes``, ``detection_masks``, ``detection_classes`` and -``detection_scores`` outputs will be used. - -.. 
code:: ipython3 - - detection_boxes = compiled_model.output("detection_boxes") - image_detection_boxes = inference_result[detection_boxes] - print("image_detection_boxes:", image_detection_boxes.shape) - - detection_masks = compiled_model.output("detection_masks") - image_detection_masks = inference_result[detection_masks] - print("image_detection_masks:", image_detection_masks.shape) - - detection_classes = compiled_model.output("detection_classes") - image_detection_classes = inference_result[detection_classes] - print("image_detection_classes:", image_detection_classes.shape) - - detection_scores = compiled_model.output("detection_scores") - image_detection_scores = inference_result[detection_scores] - print("image_detection_scores:", image_detection_scores.shape) - - num_detections = compiled_model.output("num_detections") - image_num_detections = inference_result[num_detections] - print("image_detections_num:", image_num_detections) - - # Alternatively, inference result data can be extracted by model output name with `.get()` method - assert (inference_result[detection_boxes] == inference_result.get("detection_boxes")).all(), "extracted inference result data should be equal" - - -.. parsed-literal:: - - image_detection_boxes: (1, 100, 4) - image_detection_masks: (1, 100, 33, 33) - image_detection_classes: (1, 100) - image_detection_scores: (1, 100) - image_detections_num: [100.] - - -Inference Result Visualization -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Define utility functions to visualize the inference results - -.. code:: ipython3 - - from typing import Optional - - - def add_detection_box(box: np.ndarray, image: np.ndarray, mask: np.ndarray, label: Optional[str] = None) -> np.ndarray: - """ - Helper function for adding single bounding box to the image - - Parameters - ---------- - box : np.ndarray - Bounding box coordinates in format [ymin, xmin, ymax, xmax] - image : np.ndarray - The image to which detection box is added - mask: np.ndarray - Segmentation mask in format (H, W) - label : str, optional - Detection box label string, if not provided will not be added to result image (default is None) - - Returns - ------- - np.ndarray - NumPy array including image, detection box, and segmentation mask - - """ - ymin, xmin, ymax, xmax = box - point1, point2 = (int(xmin), int(ymin)), (int(xmax), int(ymax)) - box_color = [np.random.randint(0, 255) for _ in range(3)] - line_thickness = round(0.002 * (image.shape[0] + image.shape[1]) / 2) + 1 - - result = cv2.rectangle( - img=image, - pt1=point1, - pt2=point2, - color=box_color, - thickness=line_thickness, - lineType=cv2.LINE_AA, - ) - - if label: - font_thickness = max(line_thickness - 1, 1) - font_face = 0 - font_scale = line_thickness / 3 - font_color = (255, 255, 255) - text_size = cv2.getTextSize( - text=label, - fontFace=font_face, - fontScale=font_scale, - thickness=font_thickness, - )[0] - # Calculate rectangle coordinates - rectangle_point1 = point1 - rectangle_point2 = (point1[0] + text_size[0], point1[1] - text_size[1] - 3) - # Add filled rectangle - result = cv2.rectangle( - img=result, - pt1=rectangle_point1, - pt2=rectangle_point2, - color=box_color, - thickness=-1, - lineType=cv2.LINE_AA, - ) - # Calculate text position - text_position = point1[0], point1[1] - 3 - # Add text with label to filled rectangle - result = cv2.putText( - img=result, - text=label, - org=text_position, - fontFace=font_face, - fontScale=font_scale, - color=font_color, - thickness=font_thickness, - lineType=cv2.LINE_AA, - ) - mask_img = mask[:, :, 
np.newaxis] * box_color - result = cv2.addWeighted(result, 1, mask_img.astype(np.uint8), 0.6, 0) - return result - -.. code:: ipython3 - - def get_mask_frame(box, frame, mask): - """ - Transform a binary mask to fit within a specified bounding box in a frame using perspective transformation. - - Args: - box (tuple): A bounding box represented as a tuple (y_min, x_min, y_max, x_max). - frame (numpy.ndarray): The larger frame or image where the mask will be placed. - mask (numpy.ndarray): A binary mask image to be transformed. - - Returns: - numpy.ndarray: A transformed mask image that fits within the specified bounding box in the frame. - """ - x_min = frame.shape[1] * box[1] - y_min = frame.shape[0] * box[0] - x_max = frame.shape[1] * box[3] - y_max = frame.shape[0] * box[2] - rect_src = np.array( - [ - [0, 0], - [mask.shape[1], 0], - [mask.shape[1], mask.shape[0]], - [0, mask.shape[0]], - ], - dtype=np.float32, - ) - rect_dst = np.array( - [[x_min, y_min], [x_max, y_min], [x_max, y_max], [x_min, y_max]], - dtype=np.float32, - ) - M = cv2.getPerspectiveTransform(rect_src[:, :], rect_dst[:, :]) - mask_frame = cv2.warpPerspective(mask, M, (frame.shape[1], frame.shape[0]), flags=cv2.INTER_CUBIC) - return mask_frame - -.. code:: ipython3 - - from typing import Dict - - from openvino.runtime.utils.data_helpers import OVDict - - - def visualize_inference_result( - inference_result: OVDict, - image: np.ndarray, - labels_map: Dict, - detections_limit: Optional[int] = None, - ): - """ - Helper function for visualizing inference result on the image - - Parameters - ---------- - inference_result : OVDict - Result of the compiled model inference on the test image - image : np.ndarray - Original image to use for visualization - labels_map : Dict - Dictionary with mappings of detection classes numbers and its names - detections_limit : int, optional - Number of detections to show on the image, if not provided all detections will be shown (default is None) - """ - detection_boxes = inference_result.get("detection_boxes") - detection_classes = inference_result.get("detection_classes") - detection_scores = inference_result.get("detection_scores") - num_detections = inference_result.get("num_detections") - detection_masks = inference_result.get("detection_masks") - - detections_limit = int(min(detections_limit, num_detections[0]) if detections_limit is not None else num_detections[0]) - - # Normalize detection boxes coordinates to original image size - original_image_height, original_image_width, _ = image.shape - normalized_detection_boxes = detection_boxes[0, :detections_limit] * [ - original_image_height, - original_image_width, - original_image_height, - original_image_width, - ] - result = np.copy(image) - for i in range(detections_limit): - detected_class_name = labels_map[int(detection_classes[0, i])] - score = detection_scores[0, i] - mask = detection_masks[0, i] - mask_reframed = get_mask_frame(detection_boxes[0, i], image, mask) - mask_reframed = (mask_reframed > 0.5).astype(np.uint8) - label = f"{detected_class_name} {score:.2f}" - result = add_detection_box( - box=normalized_detection_boxes[i], - image=result, - mask=mask_reframed, - label=label, - ) - - plt.imshow(result) - -TensorFlow Instance Segmentation model -(`mask_rcnn_inception_resnet_v2_1024x1024 `__) -used in this notebook was trained on `COCO -2017 `__ dataset with 91 classes. For better -visualization experience we can use COCO dataset labels with human -readable class names instead of class numbers or indexes. 
- -We can download COCO dataset classes labels from `Open Model -Zoo `__: - -.. code:: ipython3 - - coco_labels_file_path = Path("./data/coco_91cl.txt") - - if not coco_labels_file_path.exists(): - download_file( - url="https://raw.githubusercontent.com/openvinotoolkit/open_model_zoo/master/data/dataset_classes/coco_91cl.txt", - filename=coco_labels_file_path.name, - directory=coco_labels_file_path.parent, - ); - - - -.. parsed-literal:: - - data/coco_91cl.txt: 0%| | 0.00/421 [00:00`__. - -Integration preprocessing to model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Preprocessing API enables making preprocessing a part of the model -reducing application code and dependency on additional image processing -libraries. The main advantage of Preprocessing API is that preprocessing -steps will be integrated into the execution graph and will be performed -on a selected device (CPU/GPU etc.) rather than always being executed on -CPU as part of an application. This will improve selected device -utilization. - -For more information, refer to the `Optimize Preprocessing -tutorial `__ and -to the overview of `Preprocessing -API `__. diff --git a/docs/notebooks/tensorflow-instance-segmentation-to-openvino-with-output_files/tensorflow-instance-segmentation-to-openvino-with-output_25_1.png b/docs/notebooks/tensorflow-instance-segmentation-to-openvino-with-output_files/tensorflow-instance-segmentation-to-openvino-with-output_25_1.png deleted file mode 100644 index 8e3d83c55a3716..00000000000000 --- a/docs/notebooks/tensorflow-instance-segmentation-to-openvino-with-output_files/tensorflow-instance-segmentation-to-openvino-with-output_25_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:40be1f179e0fa9db9b42a3dcf241f8d180b1baf3f2b3ffc1f6bb0f5daec1c133 -size 395346 diff --git a/docs/notebooks/tensorflow-instance-segmentation-to-openvino-with-output_files/tensorflow-instance-segmentation-to-openvino-with-output_39_0.png b/docs/notebooks/tensorflow-instance-segmentation-to-openvino-with-output_files/tensorflow-instance-segmentation-to-openvino-with-output_39_0.png deleted file mode 100644 index 17f665bf861591..00000000000000 --- a/docs/notebooks/tensorflow-instance-segmentation-to-openvino-with-output_files/tensorflow-instance-segmentation-to-openvino-with-output_39_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f8120fe97bc4a6f4117643038aa98d0d440b5a24552a0123ed790814b96abccf -size 393496 diff --git a/docs/notebooks/tensorflow-object-detection-to-openvino-with-output.rst b/docs/notebooks/tensorflow-object-detection-to-openvino-with-output.rst deleted file mode 100644 index 295b06aa1e1d17..00000000000000 --- a/docs/notebooks/tensorflow-object-detection-to-openvino-with-output.rst +++ /dev/null @@ -1,771 +0,0 @@ -Convert a TensorFlow Object Detection Model to OpenVINO™ -======================================================== - -`TensorFlow `__, or TF for short, is an -open-source framework for machine learning. - -The `TensorFlow Object Detection -API `__ -is an open-source computer vision framework built on top of TensorFlow. -It is used for building object detection and image segmentation models -that can localize multiple objects in the same image. TensorFlow Object -Detection API supports various architectures and models, which can be -found and downloaded from the `TensorFlow -Hub `__. 
- -This tutorial shows how to convert a TensorFlow `Faster R-CNN with -Resnet-50 -V1 `__ -object detection model to OpenVINO `Intermediate -Representation `__ -(OpenVINO IR) format, using Model Converter. After creating the OpenVINO -IR, load the model in `OpenVINO -Runtime `__ -and do inference with a sample image. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Imports <#imports>`__ -- `Settings <#settings>`__ -- `Download Model from TensorFlow - Hub <#download-model-from-tensorflow-hub>`__ -- `Convert Model to OpenVINO IR <#convert-model-to-openvino-ir>`__ -- `Test Inference on the Converted - Model <#test-inference-on-the-converted-model>`__ -- `Select inference device <#select-inference-device>`__ - - - `Load the Model <#load-the-model>`__ - - `Get Model Information <#get-model-information>`__ - - `Get an Image for Test - Inference <#get-an-image-for-test-inference>`__ - - `Perform Inference <#perform-inference>`__ - - `Inference Result - Visualization <#inference-result-visualization>`__ - -- `Next Steps <#next-steps>`__ - - - `Async inference pipeline <#async-inference-pipeline>`__ - - `Integration preprocessing to - model <#integration-preprocessing-to-model>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install required packages: - -.. code:: ipython3 - - %pip install -q "openvino>=2023.1.0" "numpy>=1.21.0" "opencv-python" "tqdm" - - %pip install -q "matplotlib>=3.4" - %pip install -q "tensorflow-macos>=2.5; sys_platform == 'darwin' and platform_machine == 'arm64'" # macOS M1 and M2 - %pip install -q "tensorflow>=2.5; sys_platform == 'darwin' and platform_machine != 'arm64'" # macOS x86 - %pip install -q "tensorflow>=2.5; sys_platform != 'darwin'" - -The notebook uses utility functions. The cell below will download the -``notebook_utils`` Python module from GitHub. - -.. code:: ipython3 - - # Fetch the notebook utils script from the openvino_notebooks repo - import requests - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("tensorflow-object-detection-to-openvino.ipynb") - -Imports -------- - - - -.. code:: ipython3 - - # Standard python modules - from pathlib import Path - - # External modules and dependencies - import cv2 - import matplotlib.pyplot as plt - import numpy as np - - # OpenVINO import - import openvino as ov - - # Notebook utils module - from notebook_utils import download_file, device_widget - -Settings --------- - - - -Define model related variables and create corresponding directories: - -.. 
code:: ipython3 - - # Create directories for models files - model_dir = Path("od-model") - model_dir.mkdir(exist_ok=True) - - # Create directory for TensorFlow model - tf_model_dir = model_dir / "tf" - tf_model_dir.mkdir(exist_ok=True) - - # Create directory for OpenVINO IR model - ir_model_dir = model_dir / "ir" - ir_model_dir.mkdir(exist_ok=True) - - model_name = "faster_rcnn_resnet50_v1_640x640" - - openvino_ir_path = ir_model_dir / f"{model_name}.xml" - - tf_model_url = "https://www.kaggle.com/models/tensorflow/faster-rcnn-resnet-v1/frameworks/tensorFlow2/variations/faster-rcnn-resnet50-v1-640x640/versions/1?tf-hub-format=compressed" - - tf_model_archive_filename = f"{model_name}.tar.gz" - -Download Model from TensorFlow Hub ----------------------------------- - - - -Download archive with TensorFlow Object Detection model -(`faster_rcnn_resnet50_v1_640x640 `__) -from TensorFlow Hub: - -.. code:: ipython3 - - download_file(url=tf_model_url, filename=tf_model_archive_filename, directory=tf_model_dir) - - - -.. parsed-literal:: - - model/tf/faster_rcnn_resnet50_v1_640x640.tar.gz: 0%| | 0.00/101M [00:00`__. - -The converted model is ready to load on a device using ``compile_model`` -or saved on disk using the ``save_model`` function to reduce loading -time when the model is run in the future. - -See the `Model Preparation -Guide `__ -for more information about model conversion and TensorFlow `models -support `__. - -.. code:: ipython3 - - ov_model = ov.convert_model(tf_model_dir) - - # Save converted OpenVINO IR model to the corresponding directory - ov.save_model(ov_model, openvino_ir_path) - -Test Inference on the Converted Model -------------------------------------- - - - -Select inference device ------------------------ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - core = ov.Core() - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -Load the Model -~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - core = ov.Core() - openvino_ir_model = core.read_model(openvino_ir_path) - compiled_model = core.compile_model(model=openvino_ir_model, device_name=device.value) - -Get Model Information -~~~~~~~~~~~~~~~~~~~~~ - - - -Faster R-CNN with Resnet-50 V1 object detection model has one input - a -three-channel image of variable size. The input tensor shape is -``[1, height, width, 3]`` with values in ``[0, 255]``. - -Model output dictionary contains several tensors: - -- ``num_detections`` - the number of detections in ``[N]`` format. -- ``detection_boxes`` - bounding box coordinates for all ``N`` - detections in ``[ymin, xmin, ymax, xmax]`` format. -- ``detection_classes`` - ``N`` detection class indexes size from the - label file. -- ``detection_scores`` - ``N`` detection scores (confidence) for each - detected class. -- ``raw_detection_boxes`` - decoded detection boxes without Non-Max - suppression. -- ``raw_detection_scores`` - class score logits for raw detection - boxes. -- ``detection_anchor_indices`` - the anchor indices of the detections - after NMS. -- ``detection_multiclass_scores`` - class score distribution (including - background) for detection boxes in the image including background - class. - -In this tutorial we will mostly use ``detection_boxes``, -``detection_classes``, ``detection_scores`` tensors. 
It is important to -mention, that values of these tensors correspond to each other and are -ordered by the highest detection score: the first detection box -corresponds to the first detection class and to the first (and highest) -detection score. - -See the `model overview page on TensorFlow -Hub `__ -for more information about model inputs, outputs and their formats. - -.. code:: ipython3 - - model_inputs = compiled_model.inputs - model_input = compiled_model.input(0) - model_outputs = compiled_model.outputs - - print("Model inputs count:", len(model_inputs)) - print("Model input:", model_input) - - print("Model outputs count:", len(model_outputs)) - print("Model outputs:") - for output in model_outputs: - print(" ", output) - - -.. parsed-literal:: - - Model inputs count: 1 - Model input: - Model outputs count: 8 - Model outputs: - - - - - - - - - - -Get an Image for Test Inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Load and save an image: - -.. code:: ipython3 - - image_path = Path("./data/coco_bike.jpg") - - download_file( - url="https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bike.jpg", - filename=image_path.name, - directory=image_path.parent, - ) - - -.. parsed-literal:: - - 'data/coco_bike.jpg' already exists. - - - - -.. parsed-literal:: - - PosixPath('/home/ea/work/openvino_notebooks/notebooks/tensorflow-object-detection-to-openvino/data/coco_bike.jpg') - - - -Read the image, resize and convert it to the input shape of the network: - -.. code:: ipython3 - - # Read the image - image = cv2.imread(filename=str(image_path)) - - # The network expects images in RGB format - image = cv2.cvtColor(image, code=cv2.COLOR_BGR2RGB) - - # Resize the image to the network input shape - resized_image = cv2.resize(src=image, dsize=(255, 255)) - - # Transpose the image to the network input shape - network_input_image = np.expand_dims(resized_image, 0) - - # Show the image - plt.imshow(image) - - - - -.. parsed-literal:: - - - - - - -.. image:: tensorflow-object-detection-to-openvino-with-output_files/tensorflow-object-detection-to-openvino-with-output_25_1.png - - -Perform Inference -~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - inference_result = compiled_model(network_input_image) - -After model inference on the test image, object detection data can be -extracted from the result. For further model result visualization -``detection_boxes``, ``detection_classes`` and ``detection_scores`` -outputs will be used. - -.. code:: ipython3 - - ( - _, - detection_boxes, - detection_classes, - _, - detection_scores, - num_detections, - _, - _, - ) = model_outputs - - image_detection_boxes = inference_result[detection_boxes] - print("image_detection_boxes:", image_detection_boxes) - - image_detection_classes = inference_result[detection_classes] - print("image_detection_classes:", image_detection_classes) - - image_detection_scores = inference_result[detection_scores] - print("image_detection_scores:", image_detection_scores) - - image_num_detections = inference_result[num_detections] - print("image_detections_num:", image_num_detections) - - # Alternatively, inference result data can be extracted by model output name with `.get()` method - assert (inference_result[detection_boxes] == inference_result.get("detection_boxes")).all(), "extracted inference result data should be equal" - - -.. 
parsed-literal:: - - image_detection_boxes: [[[0.1645457 0.54601336 0.8953864 0.85500604] - [0.67189544 0.01240015 0.9843237 0.53085935] - [0.49188587 0.0117609 0.98050654 0.8866383 ] - ... - [0.43604603 0.59332204 0.4692565 0.6341099 ] - [0.46022677 0.59246916 0.48732638 0.61871874] - [0.47092935 0.4351712 0.5583364 0.5072162 ]]] - image_detection_classes: [[18. 2. 2. 3. 2. 8. 2. 2. 3. 2. 4. 4. 2. 4. 16. 1. 1. 2. - 27. 8. 62. 2. 2. 4. 4. 2. 18. 41. 4. 4. 2. 18. 2. 2. 4. 2. - 27. 2. 27. 2. 1. 2. 16. 1. 16. 2. 2. 2. 2. 16. 2. 2. 4. 2. - 1. 33. 4. 15. 3. 2. 2. 1. 2. 1. 4. 2. 3. 11. 4. 35. 4. 1. - 40. 2. 62. 2. 4. 4. 36. 1. 36. 36. 31. 77. 2. 1. 51. 1. 34. 3. - 2. 3. 90. 2. 1. 2. 1. 2. 1. 1. 2. 4. 18. 2. 3. 2. 31. 1. - 1. 2. 2. 33. 41. 41. 31. 3. 1. 36. 3. 15. 27. 27. 4. 4. 2. 37. - 3. 15. 1. 35. 27. 4. 36. 4. 88. 3. 2. 15. 2. 4. 2. 1. 3. 4. - 27. 4. 3. 16. 44. 1. 1. 23. 4. 1. 4. 3. 4. 15. 62. 36. 77. 3. - 1. 28. 27. 35. 2. 36. 75. 28. 27. 8. 3. 36. 4. 44. 2. 35. 4. 1. - 3. 1. 1. 35. 87. 1. 1. 1. 15. 84. 1. 1. 1. 3. 1. 35. 1. 1. - 1. 62. 15. 1. 15. 44. 1. 41. 1. 62. 4. 4. 3. 43. 16. 35. 15. 2. - 4. 34. 14. 3. 62. 33. 4. 41. 2. 35. 18. 3. 15. 1. 27. 4. 87. 2. - 19. 21. 1. 1. 27. 1. 3. 3. 2. 15. 38. 1. 1. 15. 27. 4. 4. 3. - 84. 38. 1. 15. 3. 20. 62. 58. 41. 20. 2. 4. 88. 62. 15. 31. 1. 31. - 14. 19. 4. 1. 2. 8. 18. 15. 4. 2. 2. 2. 31. 84. 15. 3. 28. 2. - 27. 18. 15. 1. 31. 28. 1. 41. 8. 1. 3. 20.]] - image_detection_scores: [[0.9810079 0.9406672 0.9318088 0.877368 0.8406416 0.590001 - 0.55449295 0.53957206 0.49390146 0.48142543 0.46272704 0.44070077 - 0.40116653 0.34708446 0.31795666 0.27489546 0.24746332 0.23632598 - 0.23248206 0.22401379 0.21871354 0.20231584 0.19377239 0.14768413 - 0.1455532 0.14337878 0.12709719 0.12582931 0.11867398 0.11002147 - 0.10564942 0.09225623 0.08963215 0.08887199 0.08704525 0.08072542 - 0.08002211 0.07911447 0.0666113 0.06338121 0.06100726 0.06005874 - 0.05798694 0.05364129 0.0520498 0.05011013 0.04850959 0.04709018 - 0.04469205 0.04128502 0.04075819 0.03989548 0.03523409 0.03272378 - 0.03108071 0.02970156 0.028723 0.02845931 0.02585638 0.02348842 - 0.0233041 0.02148155 0.02133748 0.02086138 0.02035652 0.01959795 - 0.01931953 0.01926655 0.01872199 0.0185623 0.01853302 0.01838779 - 0.01818969 0.01780701 0.01727104 0.0166365 0.01586579 0.01579063 - 0.01573381 0.01528252 0.01502847 0.01451413 0.01439992 0.01428944 - 0.01419329 0.01380476 0.01360496 0.0129911 0.01249144 0.01198867 - 0.01148862 0.01145841 0.01144459 0.01139607 0.01113943 0.01108592 - 0.01089338 0.01082358 0.01051232 0.01027328 0.01006837 0.00979451 - 0.0097324 0.00960593 0.00957182 0.00953105 0.00949826 0.00942655 - 0.00942555 0.00931226 0.00907306 0.00887798 0.00884452 0.00881256 - 0.00864548 0.00854316 0.00849879 0.00849662 0.00846909 0.00820138 - 0.00816586 0.00791354 0.00790157 0.0076993 0.00768906 0.00766408 - 0.00766065 0.00764457 0.0074557 0.00721993 0.00706666 0.00700596 - 0.0067884 0.00648049 0.00646963 0.0063817 0.00635814 0.00625102 - 0.0062297 0.00599666 0.00591931 0.00585055 0.00578007 0.00576511 - 0.00572359 0.00560452 0.00558355 0.00556507 0.00553867 0.00548295 - 0.00547356 0.00543471 0.00543378 0.00540831 0.0053792 0.00535764 - 0.00523385 0.00518935 0.00505314 0.00505005 0.00492085 0.0048256 - 0.00471783 0.00470318 0.00464703 0.00461124 0.004583 0.00457273 - 0.00455803 0.00454314 0.00454088 0.00441311 0.00437612 0.00426319 - 0.00420744 0.00415996 0.00409997 0.00409557 0.00407971 0.00405195 - 0.00404085 0.00399853 0.00399512 0.00393439 0.00390283 0.00387302 - 0.0038489 0.00382758 
0.00380028 0.00379529 0.00376791 0.00374193 - 0.00371191 0.0036963 0.00366445 0.00358808 0.00351783 0.00350439 - 0.00344527 0.00343266 0.00342918 0.0033823 0.00332239 0.00330844 - 0.00329753 0.00327267 0.00315135 0.0031098 0.00308979 0.00308362 - 0.00305496 0.00304868 0.00304044 0.00303659 0.00302582 0.00301237 - 0.00298851 0.00291267 0.00290264 0.00289242 0.00287722 0.00286563 - 0.0028257 0.00282502 0.00275258 0.00274531 0.0027204 0.00268617 - 0.00261917 0.00260795 0.00256594 0.00254094 0.00252856 0.00250768 - 0.00249793 0.00249551 0.00248255 0.00247911 0.00246619 0.00241695 - 0.00240165 0.00236032 0.00235902 0.00234437 0.00234337 0.0023379 - 0.00233535 0.00230773 0.00230558 0.00229113 0.00228888 0.0022631 - 0.00225214 0.00224186 0.00222553 0.00219966 0.00219677 0.00217865 - 0.00217775 0.00215921 0.0021541 0.00214997 0.00212954 0.00211928 - 0.0021005 0.00205066 0.0020487 0.00203887 0.00203537 0.00203026 - 0.00201357 0.00199936 0.00199386 0.00197951 0.00197287 0.00195502 - 0.00194848 0.00192128 0.00189951 0.00187285 0.0018519 0.0018299 - 0.00179158 0.00177908 0.00176328 0.00176319 0.00175034 0.00173788 - 0.00172983 0.00172819 0.00168272 0.0016768 0.00167543 0.00167397 - 0.0016395 0.00163637 0.00163319 0.00162886 0.00162824 0.00162028]] - image_detections_num: [300.] - - -Inference Result Visualization -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Define utility functions to visualize the inference results - -.. code:: ipython3 - - from typing import Optional - - - def add_detection_box(box: np.ndarray, image: np.ndarray, label: Optional[str] = None) -> np.ndarray: - """ - Helper function for adding single bounding box to the image - - Parameters - ---------- - box : np.ndarray - Bounding box coordinates in format [ymin, xmin, ymax, xmax] - image : np.ndarray - The image to which detection box is added - label : str, optional - Detection box label string, if not provided will not be added to result image (default is None) - - Returns - ------- - np.ndarray - NumPy array including both image and detection box - - """ - ymin, xmin, ymax, xmax = box - point1, point2 = (int(xmin), int(ymin)), (int(xmax), int(ymax)) - box_color = [np.random.randint(0, 255) for _ in range(3)] - line_thickness = round(0.002 * (image.shape[0] + image.shape[1]) / 2) + 1 - - cv2.rectangle( - img=image, - pt1=point1, - pt2=point2, - color=box_color, - thickness=line_thickness, - lineType=cv2.LINE_AA, - ) - - if label: - font_thickness = max(line_thickness - 1, 1) - font_face = 0 - font_scale = line_thickness / 3 - font_color = (255, 255, 255) - text_size = cv2.getTextSize( - text=label, - fontFace=font_face, - fontScale=font_scale, - thickness=font_thickness, - )[0] - # Calculate rectangle coordinates - rectangle_point1 = point1 - rectangle_point2 = (point1[0] + text_size[0], point1[1] - text_size[1] - 3) - # Add filled rectangle - cv2.rectangle( - img=image, - pt1=rectangle_point1, - pt2=rectangle_point2, - color=box_color, - thickness=-1, - lineType=cv2.LINE_AA, - ) - # Calculate text position - text_position = point1[0], point1[1] - 3 - # Add text with label to filled rectangle - cv2.putText( - img=image, - text=label, - org=text_position, - fontFace=font_face, - fontScale=font_scale, - color=font_color, - thickness=font_thickness, - lineType=cv2.LINE_AA, - ) - return image - -.. 
code:: ipython3 - - from typing import Dict - - from openvino.runtime.utils.data_helpers import OVDict - - - def visualize_inference_result( - inference_result: OVDict, - image: np.ndarray, - labels_map: Dict, - detections_limit: Optional[int] = None, - ): - """ - Helper function for visualizing inference result on the image - - Parameters - ---------- - inference_result : OVDict - Result of the compiled model inference on the test image - image : np.ndarray - Original image to use for visualization - labels_map : Dict - Dictionary with mappings of detection classes numbers and its names - detections_limit : int, optional - Number of detections to show on the image, if not provided all detections will be shown (default is None) - """ - detection_boxes: np.ndarray = inference_result.get("detection_boxes") - detection_classes: np.ndarray = inference_result.get("detection_classes") - detection_scores: np.ndarray = inference_result.get("detection_scores") - num_detections: np.ndarray = inference_result.get("num_detections") - - detections_limit = int(min(detections_limit, num_detections[0]) if detections_limit is not None else num_detections[0]) - - # Normalize detection boxes coordinates to original image size - original_image_height, original_image_width, _ = image.shape - normalized_detection_boxex = detection_boxes[::] * [ - original_image_height, - original_image_width, - original_image_height, - original_image_width, - ] - - image_with_detection_boxex = np.copy(image) - - for i in range(detections_limit): - detected_class_name = labels_map[int(detection_classes[0, i])] - score = detection_scores[0, i] - label = f"{detected_class_name} {score:.2f}" - add_detection_box( - box=normalized_detection_boxex[0, i], - image=image_with_detection_boxex, - label=label, - ) - - plt.imshow(image_with_detection_boxex) - -TensorFlow Object Detection model -(`faster_rcnn_resnet50_v1_640x640 `__) -used in this notebook was trained on `COCO -2017 `__ dataset with 91 classes. For better -visualization experience we can use COCO dataset labels with human -readable class names instead of class numbers or indexes. - -We can download COCO dataset classes labels from `Open Model -Zoo `__: - -.. code:: ipython3 - - coco_labels_file_path = Path("./data/coco_91cl.txt") - - download_file( - url="https://raw.githubusercontent.com/openvinotoolkit/open_model_zoo/master/data/dataset_classes/coco_91cl.txt", - filename=coco_labels_file_path.name, - directory=coco_labels_file_path.parent, - ) - - - -.. parsed-literal:: - - data/coco_91cl.txt: 0%| | 0.00/421 [00:00`__. - -Integration preprocessing to model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Preprocessing API enables making preprocessing a part of the model -reducing application code and dependency on additional image processing -libraries. The main advantage of Preprocessing API is that preprocessing -steps will be integrated into the execution graph and will be performed -on a selected device (CPU/GPU etc.) rather than always being executed on -CPU as part of an application. This will improve selected device -utilization. - -For more information, refer to the `Optimize Preprocessing -tutorial `__ and -to the overview of `Preprocessing -API `__. 
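As a rough, hypothetical illustration of that idea (it is not part of
the pipeline above), the sketch below uses ``PrePostProcessor`` to fold
the resize and element type handling into the converted Faster R-CNN
IR, so the application can pass raw ``uint8`` frames of any size. The
``NHWC`` layout and the 640x640 target size are assumptions that would
need to match the actual model; ``openvino_ir_path`` and ``device`` are
reused from the cells above.

.. code:: ipython3

    import openvino as ov
    from openvino.preprocess import PrePostProcessor, ResizeAlgorithm

    core = ov.Core()
    model = core.read_model(openvino_ir_path)  # IR created earlier in this notebook

    ppp = PrePostProcessor(model)
    # The application will pass uint8 NHWC images of arbitrary height and width...
    ppp.input().tensor() \
        .set_element_type(ov.Type.u8) \
        .set_layout(ov.Layout("NHWC")) \
        .set_spatial_dynamic_shape()
    ppp.input().model().set_layout(ov.Layout("NHWC"))
    # ...and the resize (plus any element type conversion) now happens inside the
    # execution graph, on whichever device the model is compiled for.
    ppp.input().preprocess().resize(ResizeAlgorithm.RESIZE_LINEAR, 640, 640)
    model_with_preprocessing = ppp.build()

    compiled_model = core.compile_model(model_with_preprocessing, device_name=device.value)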
diff --git a/docs/notebooks/tensorflow-object-detection-to-openvino-with-output_files/tensorflow-object-detection-to-openvino-with-output_25_1.png b/docs/notebooks/tensorflow-object-detection-to-openvino-with-output_files/tensorflow-object-detection-to-openvino-with-output_25_1.png deleted file mode 100644 index 8e3d83c55a3716..00000000000000 --- a/docs/notebooks/tensorflow-object-detection-to-openvino-with-output_files/tensorflow-object-detection-to-openvino-with-output_25_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:40be1f179e0fa9db9b42a3dcf241f8d180b1baf3f2b3ffc1f6bb0f5daec1c133 -size 395346 diff --git a/docs/notebooks/tensorflow-object-detection-to-openvino-with-output_files/tensorflow-object-detection-to-openvino-with-output_38_0.png b/docs/notebooks/tensorflow-object-detection-to-openvino-with-output_files/tensorflow-object-detection-to-openvino-with-output_38_0.png deleted file mode 100644 index 63fba9108b8497..00000000000000 --- a/docs/notebooks/tensorflow-object-detection-to-openvino-with-output_files/tensorflow-object-detection-to-openvino-with-output_38_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7ee879b002fbf3b42866784eb7a3cdc5bcef8068cc07eb7a679b474b67be04fa -size 391809 diff --git a/docs/notebooks/tensorflow-quantization-aware-training-with-output.rst b/docs/notebooks/tensorflow-quantization-aware-training-with-output.rst deleted file mode 100644 index 1df4fb38e0c43c..00000000000000 --- a/docs/notebooks/tensorflow-quantization-aware-training-with-output.rst +++ /dev/null @@ -1,531 +0,0 @@ -Quantization Aware Training with NNCF, using TensorFlow Framework -================================================================= - -The goal of this notebook to demonstrate how to use the Neural Network -Compression Framework `NNCF `__ -8-bit quantization to optimize a TensorFlow model for inference with -OpenVINO™ Toolkit. The optimization process contains the following -steps: - -- Transforming the original ``FP32`` model to ``INT8`` -- Using fine-tuning to restore the accuracy. -- Exporting optimized and original models to Frozen Graph and then to - OpenVINO. -- Measuring and comparing the performance of models. - -For more advanced usage, refer to these -`examples `__. - -This tutorial uses the ResNet-18 model with Imagenette dataset. -Imagenette is a subset of 10 easily classified classes from the ImageNet -dataset. Using the smaller model and dataset will speed up training and -download time. - - -**Table of contents:** - - -- `Imports and Settings <#imports-and-settings>`__ -- `Dataset Preprocessing <#dataset-preprocessing>`__ -- `Define a Floating-Point Model <#define-a-floating-point-model>`__ -- `Pre-train a Floating-Point - Model <#pre-train-a-floating-point-model>`__ -- `Create and Initialize - Quantization <#create-and-initialize-quantization>`__ -- `Fine-tune the Compressed Model <#fine-tune-the-compressed-model>`__ -- `Export Models to OpenVINO Intermediate Representation - (IR) <#export-models-to-openvino-intermediate-representation-ir>`__ -- `Benchmark Model Performance by Computing Inference - Time <#benchmark-model-performance-by-computing-inference-time>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. 
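Before stepping through the notebook cell by cell, the condensed,
self-contained sketch below shows the overall shape of the
quantization-aware training workflow on a tiny stand-in model. The
layer sizes, dataset, and hyperparameters here are placeholders; the
real ResNet-18 and Imagenette configuration is defined in the sections
that follow.

.. code:: ipython3

    import tensorflow as tf

    from nncf import NNCFConfig
    from nncf.tensorflow.helpers.model_creation import create_compressed_model
    from nncf.tensorflow.initialization import register_default_init_args

    # A tiny stand-in model and random dataset, used only to show the API shape.
    toy_model = tf.keras.Sequential(
        [
            tf.keras.layers.Conv2D(8, 3, activation="relu", input_shape=(64, 64, 3)),
            tf.keras.layers.GlobalAveragePooling2D(),
            tf.keras.layers.Dense(10, activation="softmax"),
        ]
    )
    toy_dataset = tf.data.Dataset.from_tensor_slices(
        (
            tf.random.uniform((32, 64, 64, 3)),
            tf.one_hot(tf.random.uniform((32,), maxval=10, dtype=tf.int32), 10),
        )
    ).batch(8)

    # 1. Describe the compression in an NNCF config.
    toy_nncf_config = NNCFConfig.from_dict(
        {
            "input_info": {"sample_size": [1, 64, 64, 3]},
            "compression": {"algorithm": "quantization"},
        }
    )
    # 2. Give NNCF a data loader so it can initialize the quantization ranges.
    toy_nncf_config = register_default_init_args(toy_nncf_config, toy_dataset, batch_size=8)
    # 3. Wrap the model and fine-tune it as usual to recover accuracy.
    compression_ctrl, toy_int8_model = create_compressed_model(toy_model, toy_nncf_config)
    toy_int8_model.compile(optimizer="adam", loss="categorical_crossentropy")
    toy_int8_model.fit(toy_dataset, epochs=1)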
- -Imports and Settings --------------------- - - - -Import NNCF and all auxiliary packages from your Python code. Set a name -for the model, input image size, used batch size, and the learning rate. -Also, define paths where Frozen Graph and OpenVINO IR versions of the -models will be stored. - - **NOTE**: All NNCF logging messages below ERROR level (INFO and - WARNING) are disabled to simplify the tutorial. For production use, - it is recommended to enable logging by removing - ``set_log_level(logging.ERROR)``. - -.. code:: ipython3 - - %pip install -q "openvino>=2024.6.0" "nncf>=2.14.0" - %pip install -q "tensorflow-macos>=2.9.3,<2.16.0; sys_platform == 'darwin' and platform_machine == 'arm64'" - %pip install -q "tensorflow>=2.9.3,<2.16.0; sys_platform == 'darwin' and platform_machine != 'arm64'" # macOS x86 - %pip install -q "tensorflow>=2.9.3,<2.16.0; sys_platform != 'darwin'" - %pip install -q "tensorflow-datasets>=4.9.0,<4.9.3; platform_system=='Windows'" - %pip install -q "tensorflow-datasets>=4.9.0" - -.. code:: ipython3 - - from pathlib import Path - import logging - - import tensorflow as tf - import tensorflow_datasets as tfds - - from nncf import NNCFConfig - from nncf.tensorflow.helpers.model_creation import create_compressed_model - from nncf.tensorflow.initialization import register_default_init_args - from nncf.common.logging.logger import set_log_level - import openvino as ov - - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("tensorflow-quantization-aware-training.ipynb") - - set_log_level(logging.ERROR) - - MODEL_DIR = Path("model") - OUTPUT_DIR = Path("output") - MODEL_DIR.mkdir(exist_ok=True) - OUTPUT_DIR.mkdir(exist_ok=True) - - BASE_MODEL_NAME = "ResNet-18" - - fp32_h5_path = Path(MODEL_DIR / (BASE_MODEL_NAME + "_fp32")).with_suffix(".h5") - fp32_ir_path = Path(OUTPUT_DIR / "saved_model").with_suffix(".xml") - int8_pb_path = Path(OUTPUT_DIR / (BASE_MODEL_NAME + "_int8")).with_suffix(".pb") - int8_ir_path = int8_pb_path.with_suffix(".xml") - - BATCH_SIZE = 128 - IMG_SIZE = (64, 64) # Default Imagenet image size - NUM_CLASSES = 10 # For Imagenette dataset - - LR = 1e-5 - - MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255) # From Imagenet dataset - STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255) # From Imagenet dataset - - fp32_pth_url = "https://storage.openvinotoolkit.org/repositories/nncf/openvino_notebook_ckpts/305_resnet18_imagenette_fp32_v1.h5" - _ = tf.keras.utils.get_file(fp32_h5_path.resolve(), fp32_pth_url) - print(f"Absolute path where the model weights are saved:\n {fp32_h5_path.resolve()}") - -Dataset Preprocessing ---------------------- - - - -Download and prepare Imagenette 160px dataset. - -- Number of classes: 10 -- Download size: 94.18 MiB - -:: - - | Split | Examples | - |--------------|----------| - | 'train' | 12,894 | - | 'validation' | 500 | - -.. 
code:: ipython3 - - datasets, datasets_info = tfds.load( - "imagenette/160px", - shuffle_files=True, - as_supervised=True, - with_info=True, - read_config=tfds.ReadConfig(shuffle_seed=0), - ) - train_dataset, validation_dataset = datasets["train"], datasets["validation"] - fig = tfds.show_examples(train_dataset, datasets_info) - - -.. parsed-literal:: - - 2024-03-25 10:42:57.451466: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2211] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform. - Skipping registering GPU devices... - 2024-03-25 10:42:57.665497: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead. - - - -.. image:: tensorflow-quantization-aware-training-with-output_files/tensorflow-quantization-aware-training-with-output_6_1.png - - -.. code:: ipython3 - - def preprocessing(image, label): - image = tf.image.resize(image, IMG_SIZE) - image = image - MEAN_RGB - image = image / STDDEV_RGB - label = tf.one_hot(label, NUM_CLASSES) - return image, label - - - train_dataset = train_dataset.map(preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE) - - validation_dataset = ( - validation_dataset.map(preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE) - ) - -Define a Floating-Point Model ------------------------------ - - - -.. code:: ipython3 - - def residual_conv_block(filters, stage, block, strides=(1, 1), cut="pre"): - def layer(input_tensor): - x = tf.keras.layers.BatchNormalization(epsilon=2e-5)(input_tensor) - x = tf.keras.layers.Activation("relu")(x) - - # Defining shortcut connection. - if cut == "pre": - shortcut = input_tensor - elif cut == "post": - shortcut = tf.keras.layers.Conv2D( - filters, - (1, 1), - strides=strides, - kernel_initializer="he_uniform", - use_bias=False, - )(x) - - # Continue with convolution layers. - x = tf.keras.layers.ZeroPadding2D(padding=(1, 1))(x) - x = tf.keras.layers.Conv2D( - filters, - (3, 3), - strides=strides, - kernel_initializer="he_uniform", - use_bias=False, - )(x) - - x = tf.keras.layers.BatchNormalization(epsilon=2e-5)(x) - x = tf.keras.layers.Activation("relu")(x) - x = tf.keras.layers.ZeroPadding2D(padding=(1, 1))(x) - x = tf.keras.layers.Conv2D(filters, (3, 3), kernel_initializer="he_uniform", use_bias=False)(x) - - # Add residual connection. 
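            # "shortcut" is either the untouched block input (cut == "pre") or a strided
            # 1x1 projection of the normalized input (cut == "post"), so both branches
            # have matching shapes for the element-wise addition below.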
- x = tf.keras.layers.Add()([x, shortcut]) - return x - - return layer - - - def ResNet18(input_shape=None): - """Instantiates the ResNet18 architecture.""" - img_input = tf.keras.layers.Input(shape=input_shape, name="data") - - # ResNet18 bottom - x = tf.keras.layers.BatchNormalization(epsilon=2e-5, scale=False)(img_input) - x = tf.keras.layers.ZeroPadding2D(padding=(3, 3))(x) - x = tf.keras.layers.Conv2D(64, (7, 7), strides=(2, 2), kernel_initializer="he_uniform", use_bias=False)(x) - x = tf.keras.layers.BatchNormalization(epsilon=2e-5)(x) - x = tf.keras.layers.Activation("relu")(x) - x = tf.keras.layers.ZeroPadding2D(padding=(1, 1))(x) - x = tf.keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding="valid")(x) - - # ResNet18 body - repetitions = (2, 2, 2, 2) - for stage, rep in enumerate(repetitions): - for block in range(rep): - filters = 64 * (2**stage) - if block == 0 and stage == 0: - x = residual_conv_block(filters, stage, block, strides=(1, 1), cut="post")(x) - elif block == 0: - x = residual_conv_block(filters, stage, block, strides=(2, 2), cut="post")(x) - else: - x = residual_conv_block(filters, stage, block, strides=(1, 1), cut="pre")(x) - x = tf.keras.layers.BatchNormalization(epsilon=2e-5)(x) - x = tf.keras.layers.Activation("relu")(x) - - # ResNet18 top - x = tf.keras.layers.GlobalAveragePooling2D()(x) - x = tf.keras.layers.Dense(NUM_CLASSES)(x) - x = tf.keras.layers.Activation("softmax")(x) - - # Create the model. - model = tf.keras.models.Model(img_input, x) - - return model - -.. code:: ipython3 - - IMG_SHAPE = IMG_SIZE + (3,) - fp32_model = ResNet18(input_shape=IMG_SHAPE) - -Pre-train a Floating-Point Model --------------------------------- - - - -Using NNCF for model compression assumes that the user has a pre-trained -model and a training pipeline. - - **NOTE** For the sake of simplicity of the tutorial, it is - recommended to skip ``FP32`` model training and load the weights that - are provided. - -.. code:: ipython3 - - # Load the floating-point weights. - fp32_model.load_weights(fp32_h5_path) - - # Compile the floating-point model. - fp32_model.compile( - loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1), - metrics=[tf.keras.metrics.CategoricalAccuracy(name="acc@1")], - ) - - # Validate the floating-point model. - test_loss, acc_fp32 = fp32_model.evaluate( - validation_dataset, - callbacks=tf.keras.callbacks.ProgbarLogger(stateful_metrics=["acc@1"]), - ) - print(f"\nAccuracy of FP32 model: {acc_fp32:.3f}") - - -.. parsed-literal:: - - 4/4 [==============================] - 1s 187ms/sample - loss: 0.9807 - acc@1: 0.8220 - - Accuracy of FP32 model: 0.822 - - -Create and Initialize Quantization ----------------------------------- - - - -NNCF enables compression-aware training by integrating into regular -training pipelines. The framework is designed so that modifications to -your original training code are minor. Quantization is the simplest -scenario and requires only 3 modifications. - -1. Configure NNCF parameters to specify compression - -.. code:: ipython3 - - nncf_config_dict = { - "input_info": {"sample_size": [1, 3] + list(IMG_SIZE)}, - "log_dir": str(OUTPUT_DIR), # The log directory for NNCF-specific logging outputs. - "compression": { - "algorithm": "quantization", # Specify the algorithm here. - }, - } - nncf_config = NNCFConfig.from_dict(nncf_config_dict) - -2. 
Provide a data loader to initialize the values of quantization ranges - and determine which activation should be signed or unsigned from the - collected statistics, using a given number of samples. - -.. code:: ipython3 - - nncf_config = register_default_init_args(nncf_config=nncf_config, data_loader=train_dataset, batch_size=BATCH_SIZE) - -3. Create a wrapped model ready for compression fine-tuning from a - pre-trained ``FP32`` model and a configuration object. - -.. code:: ipython3 - - compression_ctrl, int8_model = create_compressed_model(fp32_model, nncf_config) - -Evaluate the new model on the validation set after initialization of -quantization. The accuracy should be not far from the accuracy of the -floating-point ``FP32`` model for a simple case like the one being -demonstrated here. - -.. code:: ipython3 - - # Compile the INT8 model. - int8_model.compile( - optimizer=tf.keras.optimizers.Adam(learning_rate=LR), - loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1), - metrics=[tf.keras.metrics.CategoricalAccuracy(name="acc@1")], - ) - - # Validate the INT8 model. - test_loss, test_acc = int8_model.evaluate( - validation_dataset, - callbacks=tf.keras.callbacks.ProgbarLogger(stateful_metrics=["acc@1"]), - ) - - -.. parsed-literal:: - - 4/4 [==============================] - 1s 332ms/sample - loss: 0.9771 - acc@1: 0.8060 - - -Fine-tune the Compressed Model ------------------------------- - - - -At this step, a regular fine-tuning process is applied to further -improve quantized model accuracy. Normally, several epochs of tuning are -required with a small learning rate, the same that is usually used at -the end of the training of the original model. No other changes in the -training pipeline are required. Here is a simple example. - -.. code:: ipython3 - - print(f"\nAccuracy of INT8 model after initialization: {test_acc:.3f}") - - # Train the INT8 model. - int8_model.fit(train_dataset, epochs=2) - - # Validate the INT8 model. - test_loss, acc_int8 = int8_model.evaluate( - validation_dataset, - callbacks=tf.keras.callbacks.ProgbarLogger(stateful_metrics=["acc@1"]), - ) - print(f"\nAccuracy of INT8 model after fine-tuning: {acc_int8:.3f}") - print(f"\nAccuracy drop of tuned INT8 model over pre-trained FP32 model: {acc_fp32 - acc_int8:.3f}") - - -.. parsed-literal:: - - - Accuracy of INT8 model after initialization: 0.806 - Epoch 1/2 - 101/101 [==============================] - 40s 292ms/step - loss: 0.7138 - acc@1: 0.9302 - Epoch 2/2 - 101/101 [==============================] - 27s 269ms/step - loss: 0.6803 - acc@1: 0.9499 - 4/4 [==============================] - 0s 78ms/sample - loss: 0.9757 - acc@1: 0.8100 - - Accuracy of INT8 model after fine-tuning: 0.810 - - Accuracy drop of tuned INT8 model over pre-trained FP32 model: 0.012 - - -Export Models to OpenVINO Intermediate Representation (IR) ----------------------------------------------------------- - - - -Use model conversion Python API to convert the models to OpenVINO IR. - -For more information about model conversion, see this -`page `__. - -Executing this command may take a while. - -.. code:: ipython3 - - model_ir_fp32 = ov.convert_model(fp32_model) - -.. code:: ipython3 - - model_ir_int8 = ov.convert_model(int8_model) - -.. 
code:: ipython3 - - ov.save_model(model_ir_fp32, fp32_ir_path, compress_to_fp16=False) - ov.save_model(model_ir_int8, int8_ir_path, compress_to_fp16=False) - -Benchmark Model Performance by Computing Inference Time -------------------------------------------------------- - - - -Finally, measure the inference performance of the ``FP32`` and ``INT8`` -models, using `Benchmark -Tool `__ -- an inference performance measurement tool in OpenVINO. By default, -Benchmark Tool runs inference for 60 seconds in asynchronous mode on -CPU. It returns inference speed as latency (milliseconds per image) and -throughput (frames per second) values. - - **NOTE**: This notebook runs ``benchmark_app`` for 15 seconds to give - a quick indication of performance. For more accurate performance, it - is recommended to run ``benchmark_app`` in a terminal/command prompt - after closing other applications. Run - ``benchmark_app -m model.xml -d CPU`` to benchmark async inference on - CPU for one minute. Change CPU to GPU to benchmark on GPU. Run - ``benchmark_app --help`` to see an overview of all command-line - options. - -Please select a benchmarking device using the dropdown list: - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'GPU.0', 'GPU.1'), value='CPU') - - - -.. code:: ipython3 - - def parse_benchmark_output(benchmark_output): - parsed_output = [line for line in benchmark_output if "FPS" in line] - print(*parsed_output, sep="\n") - - - print("Benchmark FP32 model (IR)") - benchmark_output = ! benchmark_app -m $fp32_ir_path -d $device.value -api async -t 15 -shape [1,64,64,3] - parse_benchmark_output(benchmark_output) - - print("\nBenchmark INT8 model (IR)") - benchmark_output = ! benchmark_app -m $int8_ir_path -d $device.value -api async -t 15 -shape [1,64,64,3] - parse_benchmark_output(benchmark_output) - - -.. parsed-literal:: - - Benchmark FP32 model (IR) - [ INFO ] Throughput: 3091.93 FPS - - Benchmark INT8 model (IR) - [ INFO ] Throughput: 12826.50 FPS - - -Show Device Information for reference. - -.. code:: ipython3 - - import openvino.properties as props - - - core = ov.Core() - core.get_property(device.value, props.device.full_name) - - - - -.. parsed-literal:: - - 'Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz' - - - -.. 
code:: ipython3 - - # Cleanup - # %pip uninstall -q -y tensorflow tf_keras diff --git a/docs/notebooks/tensorflow-quantization-aware-training-with-output_files/tensorflow-quantization-aware-training-with-output_6_1.png b/docs/notebooks/tensorflow-quantization-aware-training-with-output_files/tensorflow-quantization-aware-training-with-output_6_1.png deleted file mode 100644 index b7f901ca00e522..00000000000000 --- a/docs/notebooks/tensorflow-quantization-aware-training-with-output_files/tensorflow-quantization-aware-training-with-output_6_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:98967be1ba2ecb59f9308b03330feefbc0c92d2b5b366d11d16deb8cdc727c69 -size 518753 diff --git a/docs/notebooks/text-to-image-genai-with-output.rst b/docs/notebooks/text-to-image-genai-with-output.rst deleted file mode 100644 index bd7c06cbc91961..00000000000000 --- a/docs/notebooks/text-to-image-genai-with-output.rst +++ /dev/null @@ -1,320 +0,0 @@ -Text to Image pipeline and OpenVINO with Generate API -===================================================== - -`OpenVINO™ GenAI `__ -is a library of the most popular Generative AI model pipelines, -optimized execution methods, and samples that run on top of highly -performant OpenVINO Runtime. - -This library is friendly to PC and laptop execution, and optimized for -resource consumption. It requires no external dependencies to run -generative models as it already includes all the core functionality -(e.g. tokenization via openvino-tokenizers). - -In this notebook we will demonstrate how to use text to image models -like Stable Diffusion 1.5, 2.1, LCM using `Dreamlike Anime -1.0 `__ as an -example. All it takes is two steps: 1. Export OpenVINO IR format model -using the `Hugging Face -Optimum `__ library -accelerated by OpenVINO integration. The Hugging Face Optimum Intel API -is a high-level API that enables us to convert and quantize models from -the Hugging Face Transformers library to the OpenVINO™ IR format. For -more details, refer to the `Hugging Face Optimum Intel -documentation `__. -2. Run inference using the `Text-to-Image Generation -pipeline `__ -from OpenVINO GenAI. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Convert model using Optimum-CLI - tool <#convert-model-using-optimum-cli-tool>`__ -- `Run inference OpenVINO model with - Text2ImagePipeline <#run-inference-openvino-model-with-text2imagepipeline>`__ -- `Run inference OpenVINO model with Text2ImagePipeline with optional - LoRA - adapters <#run-inference-openvino-model-with-text2imagepipeline-with-optional-lora-adapters>`__ -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. 
code:: ipython3 - - import platform - import requests - from pathlib import Path - - - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" - - %pip install -q -U --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly "openvino>=2024.5" "openvino-tokenizers>=2024.5" "openvino-genai>=2024.5" - %pip install -q -U "transformers>=4.45" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q Pillow "diffusers>=0.30.3" "gradio>=4.19" "typing_extensions>=4.9" "tqdm" - if platform.system() == "Darwin": - %pip install -q "numpy<2.0.0" - - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("text-to-image-genai.ipynb") - -Convert model using Optimum-CLI tool ------------------------------------- - - - -`Optimum Intel `__ -is the interface between the -`Transformers `__ and -`Diffusers `__ libraries -and OpenVINO to accelerate end-to-end pipelines on Intel architectures. -It provides ease-to-use cli interface for exporting models to `OpenVINO -Intermediate Representation -(IR) `__ -format. - -The command bellow demonstrates basic command for model export with -``optimum-cli`` - -.. code:: bash - - optimum-cli export openvino --model --task - -where ``--model`` argument is model id from HuggingFace Hub or local -directory with model (saved using ``.save_pretrained`` method), -``--task`` is one of `supported -task `__ -that exported model should solve. For image generation models, -``text-to-image`` should be used. If model initialization requires to -use remote code, ``--trust-remote-code`` flag additionally should be -passed. You can also apply fp16, 8-bit or 4-bit weight compression on -the Linear, Convolutional and Embedding layers when exporting your model -with the CLI by setting ``--weight-format`` to respectively fp16, int8 -or int4. This type of optimization allows to reduce the memory footprint -and inference latency. - -We will use ``optimum_cli`` from our helper ``cmd_helper.py`` that is a -wrapper over cli-command. - -.. code:: ipython3 - - from pathlib import Path - - from cmd_helper import optimum_cli - - - model_dir = Path("dreamlike_anime_1_0_ov") - - if not model_dir.exists(): - optimum_cli("dreamlike-art/dreamlike-anime-1.0", model_dir) - -Run inference OpenVINO model with Text2ImagePipeline ----------------------------------------------------- - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - - device = device_widget("CPU", exclude=["NPU"]) - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -And now just provide ``model_dir`` and the chosen inference device to -``openvino_genai.Text2ImagePipeline`` and call ``generate`` method for -inference. 
For results reproducibility we will use -``ov_genai.TorchGenerator``, pseudo-random numbers generator, which -behavior aligned with PyTorch Generator. That’s it:) - -.. code:: ipython3 - - import openvino_genai as ov_genai - from PIL import Image - from tqdm.notebook import tqdm - import sys - - num_inference_steps = 20 - - random_generator = ov_genai.TorchGenerator(42) - - pipe = ov_genai.Text2ImagePipeline(model_dir, device.value) - prompt = "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" - - pbar = tqdm(total=num_inference_steps) - - - def callback(step, num_steps, latent): - pbar.update(1) - sys.stdout.flush() - return False - - - image_tensor = pipe.generate(prompt, width=512, height=512, num_inference_steps=20, num_images_per_prompt=1, generator=random_generator, callback=callback) - - pbar.close() - - image = Image.fromarray(image_tensor.data[0]) - - - -.. parsed-literal:: - - 0%| | 0/20 [00:00`__ or -`HuggingFace `__ or trained by the user. -Adapters compatible with a base model should be used only. A weighted -blend of multiple adapters can be applied by specifying multiple adapter -files with corresponding alpha parameters in command line. Check -``lora.cpp`` source code to learn how to enable adapters and specify -them in each ``generate`` call. - -Here is an example how to run the sample with a single adapter. First -download adapter file from https://civitai.com/models/67927/soulcard -page manually and save it as ``soulcard.safetensors``. Or download it -from command line: - -.. code:: ipython3 - - if not Path("soulcard.safetensors").exists(): - r = requests.get( - url="https://civitai.com/api/download/models/72591", - ) - with open("soulcard.safetensors", "wb") as file: - file.write(r.content) - -.. code:: ipython3 - - def prepare_adapter_config(adapters): - adapter_config = ov_genai.AdapterConfig() - - # Multiple LoRA adapters applied simultaneously are supported, parse them all and corresponding alphas from cmd parameters: - for i in range(int(len(adapters) / 2)): - adapter = ov_genai.Adapter(adapters[2 * i]) - alpha = float(adapters[2 * i + 1]) - adapter_config.add(adapter, alpha) - - return adapter_config - - - adapter_config = prepare_adapter_config(["soulcard.safetensors", 0.5]) - - pipe = ov_genai.Text2ImagePipeline(model_dir, device.value, adapters=adapter_config) - - pbar = tqdm(total=num_inference_steps) - - image_tensor = pipe.generate(prompt, generator=ov_genai.TorchGenerator(42), width=512, height=512, num_inference_steps=20, callback=callback) - image = Image.fromarray(image_tensor.data[0]) - pbar.close() - - - -.. parsed-literal:: - - 0%| | 0/20 [00:00`__ - -Interactive demo ----------------- - - - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get("https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/text-to-image-genai/gradio_helper.py") - with open("gradio_helper.py", "w") as f: - f.write(r.text) - -.. 
code:: ipython3 - - from gradio_helper import make_demo - - - demo = make_demo(pipe, ov_genai.TorchGenerator, adapter_config) - - try: - demo.launch(debug=True) - except Exception: - demo.launch(share=True, debug=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/text-to-image-genai-with-output_files/text-to-image-genai-with-output_13_0.jpg b/docs/notebooks/text-to-image-genai-with-output_files/text-to-image-genai-with-output_13_0.jpg deleted file mode 100644 index 572f61b419feda..00000000000000 --- a/docs/notebooks/text-to-image-genai-with-output_files/text-to-image-genai-with-output_13_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:925683065dd251ea222992cbd6acf71f59c74d35a45c1b3f0ad8a1b88385bb90 -size 37393 diff --git a/docs/notebooks/text-to-image-genai-with-output_files/text-to-image-genai-with-output_13_0.png b/docs/notebooks/text-to-image-genai-with-output_files/text-to-image-genai-with-output_13_0.png deleted file mode 100644 index 994e0d4f0a20f0..00000000000000 --- a/docs/notebooks/text-to-image-genai-with-output_files/text-to-image-genai-with-output_13_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2cd516c5cd7b01d68bd755d384abdbadee2b8553421d28df2128e29f3e680da2 -size 391132 diff --git a/docs/notebooks/text-to-image-genai-with-output_files/text-to-image-genai-with-output_9_0.jpg b/docs/notebooks/text-to-image-genai-with-output_files/text-to-image-genai-with-output_9_0.jpg deleted file mode 100644 index 9812e0dd5cd2b7..00000000000000 --- a/docs/notebooks/text-to-image-genai-with-output_files/text-to-image-genai-with-output_9_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e802ce3f91b98135e5f510046cc9a9ff80cb00d67b9e041b568241beecb79f76 -size 52161 diff --git a/docs/notebooks/text-to-image-genai-with-output_files/text-to-image-genai-with-output_9_0.png b/docs/notebooks/text-to-image-genai-with-output_files/text-to-image-genai-with-output_9_0.png deleted file mode 100644 index 47eef2d572b76f..00000000000000 --- a/docs/notebooks/text-to-image-genai-with-output_files/text-to-image-genai-with-output_9_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1d4cc05dba15db71b79ae4b216d5e6571f2cccc61fb42f9664343da0197d30d0 -size 481254 diff --git a/docs/notebooks/tflite-selfie-segmentation-with-output.rst b/docs/notebooks/tflite-selfie-segmentation-with-output.rst deleted file mode 100644 index 74276fca7284f9..00000000000000 --- a/docs/notebooks/tflite-selfie-segmentation-with-output.rst +++ /dev/null @@ -1,621 +0,0 @@ -Selfie Segmentation using TFLite and OpenVINO -============================================= - -The Selfie segmentation pipeline allows developers to easily separate -the background from users within a scene and focus on what matters. -Adding cool effects to selfies or inserting your users into interesting -background environments has never been easier. Besides photo editing, -this technology is also important for video conferencing. It helps to -blur or replace the background during video calls. - -In this tutorial, we consider how to implement selfie segmentation using -OpenVINO. We will use `Multiclass Selfie-segmentation -model `__ -provided as part of `Google -MediaPipe `__ solution. 
- -The Multiclass Selfie-segmentation model is a multiclass semantic -segmentation model and classifies each pixel as background, hair, body, -face, clothes, and others (e.g. accessories). The model supports single -or multiple people in the frame, selfies, and full-body images. The -model is based on `Vision -Transformer `__ with customized -bottleneck and decoder architecture for real-time performance. More -details about the model can be found in `model -card `__. -This model is represented in Tensorflow Lite format. `TensorFlow -Lite `__, often referred to as -TFLite, is an open-source library developed for deploying machine -learning models to edge devices. - -The tutorial consists of following steps: - -1. Download the TFLite model and convert it to OpenVINO IR format. -2. Run inference on the image. -3. Run interactive background blurring demo on video. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ - - - `Install required dependencies <#install-required-dependencies>`__ - - `Download pretrained model and test - image <#download-pretrained-model-and-test-image>`__ - -- `Convert Tensorflow Lite model to OpenVINO IR - format <#convert-tensorflow-lite-model-to-openvino-ir-format>`__ -- `Run OpenVINO model inference on - image <#run-openvino-model-inference-on-image>`__ - - - `Load model <#load-model>`__ - - `Prepare input image <#prepare-input-image>`__ - - `Run model inference <#run-model-inference>`__ - - `Postprocess and visualize inference - results <#postprocess-and-visualize-inference-results>`__ - -- `Interactive background blurring demo on - video <#interactive-background-blurring-demo-on-video>`__ - - - `Run Live Background Blurring <#run-live-background-blurring>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install required dependencies -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %pip install -q "openvino>=2023.1.0" "opencv-python" "tqdm" - %pip install -q "matplotlib>=3.4" - -.. code:: ipython3 - - import requests - from pathlib import Path - - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("tflite-selfie-segmentation.ipynb") - -Download pretrained model and test image -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - from notebook_utils import download_file, device_widget - - tflite_model_path = Path("selfie_multiclass_256x256.tflite") - tflite_model_url = "https://storage.googleapis.com/mediapipe-models/image_segmenter/selfie_multiclass_256x256/float32/latest/selfie_multiclass_256x256.tflite" - - download_file(tflite_model_url, tflite_model_path) - - -.. parsed-literal:: - - 'selfie_multiclass_256x256.tflite' already exists. - - - - -.. 
parsed-literal:: - - PosixPath('/home/ea/work/openvino_notebooks/notebooks/tflite-selfie-segmentation/selfie_multiclass_256x256.tflite') - - - -Convert Tensorflow Lite model to OpenVINO IR format ---------------------------------------------------- - - - -Starting from the 2023.0.0 release, OpenVINO supports TFLite model -conversion. However TFLite model format can be directly passed in -``read_model`` (you can find examples of this API usage for TFLite in -`TFLite to OpenVINO conversion -tutorial `__ and -tutorial with `basic OpenVINO API -capabilities `__), it is recommended -to convert model to OpenVINO Intermediate Representation format to apply -additional optimizations (e.g. weights compression to FP16 format). To -convert the TFLite model to OpenVINO IR, model conversion Python API can -be used. The ``ov.convert_model`` function accepts a path to the TFLite -model and returns the OpenVINO Model class instance which represents -this model. The obtained model is ready to use and to be loaded on the -device using ``compile_model`` or can be saved on a disk using the -``ov.save_model`` function reducing loading time for the next running. -For more information about model conversion, see this -`page `__. -For TensorFlow Lite, refer to the `models -support `__. - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - - ir_model_path = tflite_model_path.with_suffix(".xml") - - if not ir_model_path.exists(): - ov_model = ov.convert_model(tflite_model_path) - ov.save_model(ov_model, ir_model_path) - else: - ov_model = core.read_model(ir_model_path) - -.. code:: ipython3 - - print(f"Model input info: {ov_model.inputs}") - - -.. parsed-literal:: - - Model input info: [] - - -Model input is a floating point tensor with shape [1, 256, 256, 3] in -``N, H, W, C`` format, where - -- ``N`` - batch size, number of input images. -- ``H`` - the height of the input image. -- ``W`` - width of the input image. -- ``C`` - channels of the input image. - -The model accepts images in RGB format normalized in [0, 1] range by -division on 255. - -.. code:: ipython3 - - print(f"Model output info: {ov_model.outputs}") - - -.. parsed-literal:: - - Model output info: [] - - -Model output is a floating point tensor with the similar format and -shape, except number of channels - 6 that represents number of supported -segmentation classes: background, hair, body skin, face skin, clothes, -and others. Each value in the output tensor represents of probability -that the pixel belongs to the specified class. We can use the ``argmax`` -operation to get the label with the highest probability for each pixel. - -Run OpenVINO model inference on image -------------------------------------- - - - -Let’s see the model in action. For running the inference model with -OpenVINO we should load the model on the device first. Please use the -next dropdown list for the selection inference device. - -Load model -~~~~~~~~~~ - - - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - compiled_model = core.compile_model(ov_model, device.value) - -Prepare input image -~~~~~~~~~~~~~~~~~~~ - - - -The model accepts an image with size 256x256, we need to resize our -input image to fit it in the model input tensor. 
Usually, segmentation -models are sensitive to proportions of input image details, so -preserving the original aspect ratio and adding padding can help improve -segmentation accuracy, we will use this pre-processing approach. -Additionally, the input image is represented as an RGB image in UINT8 -([0, 255] data range), we should normalize it in [0, 1]. - -.. code:: ipython3 - - import cv2 - import numpy as np - from notebook_utils import load_image - - # Read input image and convert it to RGB - test_image_url = "https://user-images.githubusercontent.com/29454499/251036317-551a2399-303e-4a4a-a7d6-d7ce973e05c5.png" - image_name = "example-img.png" - - img = load_image(image_name, test_image_url) - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - - - # Preprocessing helper function - def resize_and_pad(image: np.ndarray, height: int = 256, width: int = 256): - """ - Input preprocessing function, takes input image in np.ndarray format, - resizes it to fit specified height and width with preserving aspect ratio - and adds padding on bottom or right side to complete target height x width rectangle. - - Parameters: - image (np.ndarray): input image in np.ndarray format - height (int, *optional*, 256): target height - width (int, *optional*, 256): target width - Returns: - padded_img (np.ndarray): processed image - padding_info (Tuple[int, int]): information about padding size, required for postprocessing - """ - h, w = image.shape[:2] - if h < w: - img = cv2.resize(image, (width, np.floor(h / (w / width)).astype(int))) - else: - img = cv2.resize(image, (np.floor(w / (h / height)).astype(int), height)) - - r_h, r_w = img.shape[:2] - right_padding = width - r_w - bottom_padding = height - r_h - padded_img = cv2.copyMakeBorder(img, 0, bottom_padding, 0, right_padding, cv2.BORDER_CONSTANT) - return padded_img, (bottom_padding, right_padding) - - - # Apply preprocessig step - resize and pad input image - padded_img, pad_info = resize_and_pad(np.array(img)) - - # Convert input data from uint8 [0, 255] to float32 [0, 1] range and add batch dimension - normalized_img = np.expand_dims(padded_img.astype(np.float32) / 255, 0) - -Run model inference -~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - out = compiled_model(normalized_img)[0] - -Postprocess and visualize inference results -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The model predicts segmentation probabilities mask with the size 256 x -256, we need to apply postprocessing to get labels with the highest -probability for each pixel and restore the result in the original input -image size. We can interpret the result of the model in different ways, -e.g. visualize the segmentation mask, apply some visual effects on the -selected background (remove, replace it with any other picture, blur it) -or other classes (for example, change the color of person’s hair or add -makeup). - -.. 
code:: ipython3 - - from typing import Tuple - from notebook_utils import segmentation_map_to_image, SegmentationMap, Label - - # helper for visualization segmentation labels - labels = [ - Label(index=0, color=(192, 192, 192), name="background"), - Label(index=1, color=(128, 0, 0), name="hair"), - Label(index=2, color=(255, 229, 204), name="body skin"), - Label(index=3, color=(255, 204, 204), name="face skin"), - Label(index=4, color=(0, 0, 128), name="clothes"), - Label(index=5, color=(128, 0, 128), name="others"), - ] - SegmentationLabels = SegmentationMap(labels) - - - # helper for postprocessing output mask - def postprocess_mask(out: np.ndarray, pad_info: Tuple[int, int], orig_img_size: Tuple[int, int]): - """ - Posptprocessing function for segmentation mask, accepts model output tensor, - gets labels for each pixel using argmax, - unpads segmentation mask and resizes it to original image size. - - Parameters: - out (np.ndarray): model output tensor - pad_info (Tuple[int, int]): information about padding size from preprocessing step - orig_img_size (Tuple[int, int]): original image height and width for resizing - Returns: - label_mask_resized (np.ndarray): postprocessed segmentation label mask - """ - label_mask = np.argmax(out, -1)[0] - pad_h, pad_w = pad_info - unpad_h = label_mask.shape[0] - pad_h - unpad_w = label_mask.shape[1] - pad_w - label_mask_unpadded = label_mask[:unpad_h, :unpad_w] - orig_h, orig_w = orig_img_size - label_mask_resized = cv2.resize(label_mask_unpadded, (orig_w, orig_h), interpolation=cv2.INTER_NEAREST) - return label_mask_resized - - - # Get info about original image - image_data = np.array(img) - orig_img_shape = image_data.shape - - # Specify background color for replacement - BG_COLOR = (192, 192, 192) - - # Blur image for backgraund blurring scenario using Gaussian Blur - blurred_image = cv2.GaussianBlur(image_data, (55, 55), 0) - - # Postprocess output - postprocessed_mask = postprocess_mask(out, pad_info, orig_img_shape[:2]) - - # Get colored segmentation map - output_mask = segmentation_map_to_image(postprocessed_mask, SegmentationLabels.get_colormap()) - - # Replace background on original image - # fill image with solid background color - bg_image = np.full(orig_img_shape, BG_COLOR, dtype=np.uint8) - - # define condition mask for separation background and foreground - condition = np.stack((postprocessed_mask,) * 3, axis=-1) > 0 - # replace background with solid color - output_image = np.where(condition, image_data, bg_image) - # replace background with blurred image copy - output_blurred_image = np.where(condition, image_data, blurred_image) - -Visualize obtained result - -.. code:: ipython3 - - import matplotlib.pyplot as plt - - titles = ["Original image", "Portrait mask", "Removed background", "Blurred background"] - images = [image_data, output_mask, output_image, output_blurred_image] - figsize = (16, 16) - fig, axs = plt.subplots(2, 2, figsize=figsize, sharex="all", sharey="all") - fig.patch.set_facecolor("white") - list_axes = list(axs.flat) - for i, a in enumerate(list_axes): - a.set_xticklabels([]) - a.set_yticklabels([]) - a.get_xaxis().set_visible(False) - a.get_yaxis().set_visible(False) - a.grid(False) - a.imshow(images[i].astype(np.uint8)) - a.set_title(titles[i]) - fig.subplots_adjust(wspace=0.0, hspace=-0.8) - fig.tight_layout() - - - -.. 
image:: tflite-selfie-segmentation-with-output_files/tflite-selfie-segmentation-with-output_25_0.png - - -Interactive background blurring demo on video ---------------------------------------------- - - - -The following code runs model inference on a video: - -.. code:: ipython3 - - import collections - import time - from IPython import display - from typing import Union - - from notebook_utils import VideoPlayer - - - # Main processing function to run background blurring - def run_background_blurring( - source: Union[str, int] = 0, - flip: bool = False, - use_popup: bool = False, - skip_first_frames: int = 0, - model: ov.Model = ov_model, - device: str = "CPU", - video_width: int = None, # if not set the original size is used - ): - """ - Function for running background blurring inference on video - Parameters: - source (Union[str, int], *optional*, 0): input video source, it can be path or link on video file or web camera id. - flip (bool, *optional*, False): flip output video, used for front-camera video processing - use_popup (bool, *optional*, False): use popup window for avoid flickering - skip_first_frames (int, *optional*, 0): specified number of frames will be skipped in video processing - model (ov.Model): OpenVINO model for inference - device (str): inference device - Returns: - None - """ - player = None - compiled_model = core.compile_model(model, device) - try: - # Create a video player to play with target fps. - player = VideoPlayer(source=source, flip=flip, fps=30, skip_first_frames=skip_first_frames) - # Start capturing. - player.start() - if use_popup: - title = "Press ESC to Exit" - cv2.namedWindow(winname=title, flags=cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE) - - processing_times = collections.deque() - while True: - # Grab the frame. - frame = player.next() - if frame is None: - print("Source ended") - break - - if video_width: - # If the frame is larger than video_width, reduce size to improve the performance. - # If more, increase size for better demo expirience. - scale = video_width / max(frame.shape) - frame = cv2.resize( - src=frame, - dsize=None, - fx=scale, - fy=scale, - interpolation=cv2.INTER_AREA, - ) - - # Get the results. - input_image, pad_info = resize_and_pad(frame, 256, 256) - normalized_img = np.expand_dims(input_image.astype(np.float32) / 255, 0) - - start_time = time.time() - # model expects RGB image, while video capturing in BGR - segmentation_mask = compiled_model(normalized_img[:, :, :, ::-1])[0] - stop_time = time.time() - blurred_image = cv2.GaussianBlur(frame, (55, 55), 0) - postprocessed_mask = postprocess_mask(segmentation_mask, pad_info, frame.shape[:2]) - condition = np.stack((postprocessed_mask,) * 3, axis=-1) > 0 - frame = np.where(condition, frame, blurred_image) - processing_times.append(stop_time - start_time) - # Use processing times from last 200 frames. - if len(processing_times) > 200: - processing_times.popleft() - - _, f_width = frame.shape[:2] - # Mean processing time [ms]. - processing_time = np.mean(processing_times) * 1000 - fps = 1000 / processing_time - cv2.putText( - img=frame, - text=f"Inference time: {processing_time:.1f}ms ({fps:.1f} FPS)", - org=(20, 40), - fontFace=cv2.FONT_HERSHEY_COMPLEX, - fontScale=f_width / 1000, - color=(255, 0, 0), - thickness=1, - lineType=cv2.LINE_AA, - ) - # Use this workaround if there is flickering. - if use_popup: - cv2.imshow(winname=title, mat=frame) - key = cv2.waitKey(1) - # escape = 27 - if key == 27: - break - else: - # Encode numpy array to jpg. 
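                    # Streaming JPEG-encoded frames through IPython.display keeps the demo
                    # notebook-friendly: clear_output(wait=True) below replaces the previous
                    # frame, so the output cell behaves like a simple video player.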
- _, encoded_img = cv2.imencode(ext=".jpg", img=frame, params=[cv2.IMWRITE_JPEG_QUALITY, 100]) - # Create an IPython image. - i = display.Image(data=encoded_img) - # Display the image in this notebook. - display.clear_output(wait=True) - display.display(i) - # ctrl-c - except KeyboardInterrupt: - print("Interrupted") - # any different error - except RuntimeError as e: - print(e) - finally: - if player is not None: - # Stop capturing. - player.stop() - if use_popup: - cv2.destroyAllWindows() - -Run Live Background Blurring -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Use a webcam as the video input. By default, the primary webcam is set -with ``source=0``. If you have multiple webcams, each one will be -assigned a consecutive number starting at 0. Set ``flip=True`` when -using a front-facing camera. Some web browsers, especially Mozilla -Firefox, may cause flickering. If you experience flickering, -set ``use_popup=True``. - - **NOTE**: To use this notebook with a webcam, you need to run the - notebook on a computer with a webcam. If you run the notebook on a - remote server (for example, in Binder or Google Colab service), the - webcam will not work. By default, the lower cell will run model - inference on a video file. If you want to try to live inference on - your webcam set ``WEBCAM_INFERENCE = True`` - -.. code:: ipython3 - - from notebook_utils import download_file - - - WEBCAM_INFERENCE = False - - if WEBCAM_INFERENCE: - VIDEO_SOURCE = 0 # Webcam - else: - VIDEO_SOURCE = "CEO-Pat-Gelsinger-on-Leading-Intel.mp4" - download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/CEO%20Pat%20Gelsinger%20on%20Leading%20Intel.mp4", - VIDEO_SOURCE, - ) - -Select device for inference: - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -Run: - -.. code:: ipython3 - - run_background_blurring( - source=VIDEO_SOURCE, - device=device.value, - # video_width=1280 - ) - - - -.. image:: tflite-selfie-segmentation-with-output_files/tflite-selfie-segmentation-with-output_33_0.png - - -.. 
parsed-literal:: - - Source ended - diff --git a/docs/notebooks/tflite-selfie-segmentation-with-output_files/tflite-selfie-segmentation-with-output_25_0.png b/docs/notebooks/tflite-selfie-segmentation-with-output_files/tflite-selfie-segmentation-with-output_25_0.png deleted file mode 100644 index 84c4b4e4d02638..00000000000000 --- a/docs/notebooks/tflite-selfie-segmentation-with-output_files/tflite-selfie-segmentation-with-output_25_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:41a285eebb6f6f42b73d47fd6d08f942f3c09c1e8a50e2498ef8be0bfb488d95 -size 512563 diff --git a/docs/notebooks/tflite-selfie-segmentation-with-output_files/tflite-selfie-segmentation-with-output_33_0.png b/docs/notebooks/tflite-selfie-segmentation-with-output_files/tflite-selfie-segmentation-with-output_33_0.png deleted file mode 100644 index b7b1819179f52d..00000000000000 --- a/docs/notebooks/tflite-selfie-segmentation-with-output_files/tflite-selfie-segmentation-with-output_33_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ca0d32f3ccf8f28a874d429e90df0a8332e314885552beb95bb741341e75de1d -size 14231 diff --git a/docs/notebooks/tflite-to-openvino-with-output.rst b/docs/notebooks/tflite-to-openvino-with-output.rst deleted file mode 100644 index 371b6507cac3c3..00000000000000 --- a/docs/notebooks/tflite-to-openvino-with-output.rst +++ /dev/null @@ -1,239 +0,0 @@ -Convert a Tensorflow Lite Model to OpenVINO™ -============================================ - -`TensorFlow Lite `__, often -referred to as TFLite, is an open source library developed for deploying -machine learning models to edge devices. - -This short tutorial shows how to convert a TensorFlow Lite -`EfficientNet-Lite-B0 `__ -image classification model to OpenVINO `Intermediate -Representation `__ -(OpenVINO IR) format, using Model Converter. After creating the OpenVINO -IR, load the model in `OpenVINO -Runtime `__ -and do inference with a sample image. - - -**Table of contents:** - - -- `Preparation <#preparation>`__ - - - `Install requirements <#install-requirements>`__ - - `Imports <#imports>`__ - -- `Download TFLite model <#download-tflite-model>`__ -- `Convert a Model to OpenVINO IR - Format <#convert-a-model-to-openvino-ir-format>`__ -- `Load model using OpenVINO TensorFlow Lite - Frontend <#load-model-using-openvino-tensorflow-lite-frontend>`__ -- `Run OpenVINO model inference <#run-openvino-model-inference>`__ - - - `Select inference device <#select-inference-device>`__ - -- `Estimate Model Performance <#estimate-model-performance>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Preparation ------------ - - - -Install requirements -~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - %pip install -q "openvino>=2023.1.0" - %pip install -q opencv-python requests tqdm kagglehub Pillow - - # Fetch `notebook_utils` module - import requests - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("tflite-to-openvino.ipynb") - -Imports -~~~~~~~ - - - -.. code:: ipython3 - - from pathlib import Path - import numpy as np - from PIL import Image - import openvino as ov - - from notebook_utils import download_file, load_image, device_widget - -Download TFLite model ---------------------- - - - -.. code:: ipython3 - - import kagglehub - - model_dir = kagglehub.model_download("tensorflow/efficientnet/tfLite/lite0-fp32") - tflite_model_path = Path(model_dir) / "2.tflite" - - ov_model_path = tflite_model_path.with_suffix(".xml") - -Convert a Model to OpenVINO IR Format -------------------------------------- - - - -To convert the TFLite model to OpenVINO IR, model conversion Python API -can be used. ``ov.convert_model`` function accepts the path to the -TFLite model and returns an OpenVINO Model class instance which -represents this model. The obtained model is ready to use and to be -loaded on a device using ``ov.compile_model`` or can be saved on a disk -using ``ov.save_model`` function, reducing loading time for next -running. By default, model weights are compressed to FP16 during -serialization by ``ov.save_model``. For more information about model -conversion, see this -`page `__. -For TensorFlow Lite models support, refer to this -`tutorial `__. - -.. code:: ipython3 - - ov_model = ov.convert_model(tflite_model_path) - ov.save_model(ov_model, ov_model_path) - print(f"Model {tflite_model_path} successfully converted and saved to {ov_model_path}") - - -.. parsed-literal:: - - Model model/efficientnet_lite0_fp32_2.tflite successfully converted and saved to model/efficientnet_lite0_fp32_2.xml - - -Load model using OpenVINO TensorFlow Lite Frontend --------------------------------------------------- - - - -TensorFlow Lite models are supported via ``FrontEnd`` API. You may skip -conversion to IR and read models directly by OpenVINO runtime API. For -more examples supported formats reading via Frontend API, please look -this `tutorial `__. - -.. code:: ipython3 - - core = ov.Core() - - ov_model = core.read_model(tflite_model_path) - -Run OpenVINO model inference ----------------------------- - - - -We can find information about model input preprocessing in its -`description `__ -on `TensorFlow Hub `__. - -.. code:: ipython3 - - image = load_image("coco_bricks.png", "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bricks.png") - # load_image reads the image in BGR format, [:,:,::-1] reshape transfroms it to RGB - image = Image.fromarray(image[:, :, ::-1]) - resized_image = image.resize((224, 224)) - input_tensor = np.expand_dims((np.array(resized_image).astype(np.float32) - 127) / 128, 0) - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. 
code:: ipython3 - - compiled_model = core.compile_model(ov_model, device.value) - predicted_scores = compiled_model(input_tensor)[0] - -.. code:: ipython3 - - imagenet_classes_file_path = download_file("https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/datasets/imagenet/imagenet_2012.txt") - imagenet_classes = open(imagenet_classes_file_path).read().splitlines() - - top1_predicted_cls_id = np.argmax(predicted_scores) - top1_predicted_score = predicted_scores[0][top1_predicted_cls_id] - predicted_label = imagenet_classes[top1_predicted_cls_id] - - display(image.resize((640, 512))) - print(f"Predicted label: {predicted_label} with probability {top1_predicted_score :2f}") - - -.. parsed-literal:: - - 'imagenet_2012.txt' already exists. - - - -.. image:: tflite-to-openvino-with-output_files/tflite-to-openvino-with-output_16_1.png - - -.. parsed-literal:: - - Predicted label: n02109047 Great Dane with probability 0.715318 - - -Estimate Model Performance --------------------------- - -`Benchmark -Tool `__ -is used to measure the inference performance of the model on CPU and -GPU. - - **NOTE**: For more accurate performance, it is recommended to run - ``benchmark_app`` in a terminal/command prompt after closing other - applications. Run ``benchmark_app -m model.xml -d CPU`` to benchmark - async inference on CPU for one minute. Change ``CPU`` to ``GPU`` to - benchmark on GPU. Run ``benchmark_app --help`` to see an overview of - all command-line options. - -.. code:: ipython3 - - print(f"Benchmark model inference on {device.value}") - !benchmark_app -m $ov_model_path -d $device.value -t 15 diff --git a/docs/notebooks/tflite-to-openvino-with-output_files/tflite-to-openvino-with-output_16_1.jpg b/docs/notebooks/tflite-to-openvino-with-output_files/tflite-to-openvino-with-output_16_1.jpg deleted file mode 100644 index 11da22e6b9ee07..00000000000000 --- a/docs/notebooks/tflite-to-openvino-with-output_files/tflite-to-openvino-with-output_16_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a11e349455a5b61c5f93ca1017f5f92b84c29282264891abfcfe160e72e0624c -size 68170 diff --git a/docs/notebooks/tflite-to-openvino-with-output_files/tflite-to-openvino-with-output_16_1.png b/docs/notebooks/tflite-to-openvino-with-output_files/tflite-to-openvino-with-output_16_1.png deleted file mode 100644 index 2b8d2ff268b3fb..00000000000000 --- a/docs/notebooks/tflite-to-openvino-with-output_files/tflite-to-openvino-with-output_16_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9000b06fe3a0537c489dc4c87fcb6ca0bae5eecd37ad736dc7e4c6e35f766ab8 -size 621006 diff --git a/docs/notebooks/tiny-sd-image-generation-with-output.rst b/docs/notebooks/tiny-sd-image-generation-with-output.rst deleted file mode 100644 index 51435c51cead25..00000000000000 --- a/docs/notebooks/tiny-sd-image-generation-with-output.rst +++ /dev/null @@ -1,1136 +0,0 @@ -Image Generation with Tiny-SD and OpenVINO™ -=========================================== - -In recent times, the AI community has witnessed a remarkable surge in -the development of larger and more performant language models, such as -Falcon 40B, LLaMa-2 70B, Falcon 40B, MPT 30B, and in the imaging domain -with models like SD2.1 and SDXL. These advancements have undoubtedly -pushed the boundaries of what AI can achieve, enabling highly versatile -and state-of-the-art image generation and language understanding -capabilities. 
However, the breakthrough of large models comes with -substantial computational demands. To resolve this issue, recent -research on efficient Stable Diffusion has prioritized reducing the -number of sampling steps and utilizing network quantization. - -Moving towards the goal of making image generative models faster, -smaller, and cheaper, Tiny-SD was proposed by Segmind. Tiny SD is a -compressed Stable Diffusion (SD) model that has been trained on -Knowledge-Distillation (KD) techniques and the work has been largely -based on this `paper `__. The -authors describe a Block-removal Knowledge-Distillation method where -some of the UNet layers are removed and the student model weights are -trained. Using the KD methods described in the paper, they were able to -train two compressed models using the 🧨 diffusers library; Small and -Tiny, that have 35% and 55% fewer parameters, respectively than the base -model while achieving comparable image fidelity as the base model. More -details about model can be found in `model -card `__, `blog -post `__ and training -`repository `__. - -This notebook demonstrates how to convert and run the Tiny-SD model -using OpenVINO. - -The notebook contains the following steps: - -1. Convert PyTorch models to OpenVINO Intermediate Representation using - OpenVINO Converter Tool (OVC). -2. Prepare Inference Pipeline. -3. Run Inference pipeline with OpenVINO. -4. Run Interactive demo for Tiny-SD model - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Create PyTorch Models pipeline <#create-pytorch-models-pipeline>`__ -- `Convert models to OpenVINO Intermediate representation - format <#convert-models-to-openvino-intermediate-representation-format>`__ - - - `Text Encoder <#text-encoder>`__ - - `U-net <#u-net>`__ - - `VAE <#vae>`__ - -- `Prepare Inference Pipeline <#prepare-inference-pipeline>`__ -- `Configure Inference Pipeline <#configure-inference-pipeline>`__ - - - `Calibrate UNet for GPU - inference <#calibrate-unet-for-gpu-inference>`__ - - `Text-to-Image generation <#text-to-image-generation>`__ - - `Image-to-Image generation <#image-to-image-generation>`__ - - `Interactive Demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install required dependencies - -.. code:: ipython3 - - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "torch>=2.1" torchvision "openvino>=2023.3.0" "opencv-python" "pillow" "diffusers>=0.18.0" "transformers>=4.45" "gradio>=4.19" - -Create PyTorch Models pipeline ------------------------------- - - - -``StableDiffusionPipeline`` is an end-to-end inference pipeline that you -can use to generate images from text with just a few lines of code. - -First, load the pre-trained weights of all components of the model. - -.. 
code:: ipython3 - - import gc - from diffusers import StableDiffusionPipeline - import requests - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("tiny-sd-image-generation.ipynb") - - model_id = "segmind/tiny-sd" - - pipe = StableDiffusionPipeline.from_pretrained(model_id).to("cpu") - text_encoder = pipe.text_encoder - text_encoder.eval() - unet = pipe.unet - unet.eval() - vae = pipe.vae - vae.eval() - - del pipe - gc.collect() - -Convert models to OpenVINO Intermediate representation format -------------------------------------------------------------- - - - -OpenVINO supports PyTorch through conversion to OpenVINO Intermediate -Representation (IR) format. To take the advantage of OpenVINO -optimization tools and features, the model should be converted using the -OpenVINO Converter tool (OVC). The ``openvino.convert_model`` function -provides Python API for OVC usage. The function returns the instance of -the OpenVINO Model class, which is ready for use in the Python -interface. However, it can also be saved on disk using -``openvino.save_model`` for future execution. - -Starting from OpenVINO 2023.0.0 release OpenVINO supports direct -conversion PyTorch models. To perform conversion, we should provide -PyTorch model instance and example input into -``openvino.convert_model``. By default, model converted with dynamic -shapes preserving, in order to fixate input shape to generate image of -specific resolution, ``input`` parameter additionally can be specified. - -The model consists of three important parts: - -- Text Encoder for creation condition to generate image from text - prompt. -- U-net for step by step denoising latent image representation. -- Autoencoder (VAE) for encoding input image to latent space (if - required) and decoding latent space to image back after generation. - -Let us convert each part. - -Text Encoder -~~~~~~~~~~~~ - - - -The text-encoder is responsible for transforming the input prompt, for -example, “a photo of an astronaut riding a horse” into an embedding -space that can be understood by the U-Net. It is usually a simple -transformer-based encoder that maps a sequence of input tokens to a -sequence of latent text embeddings. - -Input of the text encoder is the tensor ``input_ids`` which contains -indexes of tokens from text processed by tokenizer and padded to maximum -length accepted by model. Model outputs are two tensors: -``last_hidden_state`` - hidden state from the last MultiHeadAttention -layer in the model and ``pooler_out`` - Pooled output for whole model -hidden states. - -.. code:: ipython3 - - from pathlib import Path - import torch - import openvino as ov - - TEXT_ENCODER_OV_PATH = Path("text_encoder.xml") - - - def convert_encoder(text_encoder: torch.nn.Module, ir_path: Path): - """ - Convert Text Encoder mode. 
- Function accepts text encoder model, and prepares example inputs for conversion, - Parameters: - text_encoder (torch.nn.Module): text_encoder model from Stable Diffusion pipeline - ir_path (Path): File for storing model - Returns: - None - """ - input_ids = torch.ones((1, 77), dtype=torch.long) - # switch model to inference mode - text_encoder.eval() - - # disable gradients calculation for reducing memory consumption - with torch.no_grad(): - # Export model to IR format - ov_model = ov.convert_model( - text_encoder, - example_input=input_ids, - input=[ - (1, 77), - ], - ) - ov.save_model(ov_model, ir_path) - del ov_model - print(f"Text Encoder successfully converted to IR and saved to {ir_path}") - - - if not TEXT_ENCODER_OV_PATH.exists(): - convert_encoder(text_encoder, TEXT_ENCODER_OV_PATH) - else: - print(f"Text encoder will be loaded from {TEXT_ENCODER_OV_PATH}") - - del text_encoder - gc.collect(); - -U-net -~~~~~ - - - -U-net model has three inputs: - -- ``sample`` - latent image sample from previous step. Generation - process has not been started yet, so you will use random noise. -- ``timestep`` - current scheduler step. -- ``encoder_hidden_state`` - hidden state of text encoder. - -Model predicts the ``sample`` state for the next step. - -.. code:: ipython3 - - import numpy as np - from openvino import PartialShape, Type - - UNET_OV_PATH = Path("unet.xml") - - dtype_mapping = {torch.float32: Type.f32, torch.float64: Type.f64} - - - def convert_unet(unet: torch.nn.Module, ir_path: Path): - """ - Convert U-net model to IR format. - Function accepts unet model, prepares example inputs for conversion, - Parameters: - unet (StableDiffusionPipeline): unet from Stable Diffusion pipeline - ir_path (Path): File for storing model - Returns: - None - """ - # prepare inputs - encoder_hidden_state = torch.ones((2, 77, 768)) - latents_shape = (2, 4, 512 // 8, 512 // 8) - latents = torch.randn(latents_shape) - t = torch.from_numpy(np.array(1, dtype=float)) - dummy_inputs = (latents, t, encoder_hidden_state) - input_info = [] - for input_tensor in dummy_inputs: - shape = PartialShape(tuple(input_tensor.shape)) - element_type = dtype_mapping[input_tensor.dtype] - input_info.append((shape, element_type)) - - unet.eval() - with torch.no_grad(): - ov_model = ov.convert_model(unet, example_input=dummy_inputs, input=input_info) - ov.save_model(ov_model, ir_path) - del ov_model - print(f"Unet successfully converted to IR and saved to {ir_path}") - - - if not UNET_OV_PATH.exists(): - convert_unet(unet, UNET_OV_PATH) - gc.collect() - else: - print(f"Unet will be loaded from {UNET_OV_PATH}") - del unet - gc.collect(); - -VAE -~~~ - - - -The VAE model has two parts, an encoder and a decoder. The encoder is -used to convert the image into a low dimensional latent representation, -which will serve as the input to the U-Net model. The decoder, -conversely, transforms the latent representation back into an image. - -During latent diffusion training, the encoder is used to get the latent -representations (latents) of the images for the forward diffusion -process, which applies more and more noise at each step. During -inference, the denoised latents generated by the reverse diffusion -process are converted back into images using the VAE decoder. When you -run inference for text-to-image, there is no initial image as a starting -point. You can skip this step and directly generate initial random -noise. 
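The cell below is a minimal, illustrative sketch rather than part of the
original conversion flow: it round-trips a dummy tensor through the
PyTorch ``vae`` loaded at the beginning of this notebook (it is only
deleted after the conversion cell that follows, so it is still in
memory here). It shows the ``1 x 4 x 64 x 64`` latent shape produced for
a ``512 x 512`` image and the ``0.18215`` scaling factor that Stable
Diffusion v1-family pipelines apply around the U-Net; the same constant
appears again in the inference pipeline defined later in this notebook.

.. code:: ipython3

    import torch

    scaling_factor = 0.18215  # VAE scaling factor used by SD v1-family pipelines

    with torch.no_grad():
        dummy_image = torch.zeros((1, 3, 512, 512))  # NCHW image in [-1, 1] range
        # Encode: image -> latent distribution -> sampled latents, then scale.
        latents = vae.encode(dummy_image).latent_dist.sample() * scaling_factor
        # Decode: unscale first, then reconstruct the image.
        reconstructed = vae.decode(latents / scaling_factor).sample

    print(latents.shape)        # torch.Size([1, 4, 64, 64])
    print(reconstructed.shape)  # torch.Size([1, 3, 512, 512])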
- -As the encoder and the decoder are used independently in different parts -of the pipeline, it will be better to convert them to separate models. - -.. code:: ipython3 - - VAE_ENCODER_OV_PATH = Path("vae_encodr.xml") - - - def convert_vae_encoder(vae: torch.nn.Module, ir_path: Path): - """ - Convert VAE model for encoding to IR format. - Function accepts vae model, creates wrapper class for export only necessary for inference part, - prepares example inputs for conversion, - Parameters: - vae (torch.nn.Module): VAE model from StableDiffusio pipeline - ir_path (Path): File for storing model - Returns: - None - """ - - class VAEEncoderWrapper(torch.nn.Module): - def __init__(self, vae): - super().__init__() - self.vae = vae - - def forward(self, image): - return self.vae.encode(x=image)["latent_dist"].sample() - - vae_encoder = VAEEncoderWrapper(vae) - vae_encoder.eval() - image = torch.zeros((1, 3, 512, 512)) - with torch.no_grad(): - ov_model = ov.convert_model(vae_encoder, example_input=image, input=[((1, 3, 512, 512),)]) - ov.save_model(ov_model, ir_path) - del ov_model - print(f"VAE encoder successfully converted to IR and saved to {ir_path}") - - - if not VAE_ENCODER_OV_PATH.exists(): - convert_vae_encoder(vae, VAE_ENCODER_OV_PATH) - else: - print(f"VAE encoder will be loaded from {VAE_ENCODER_OV_PATH}") - - VAE_DECODER_OV_PATH = Path("vae_decoder.xml") - - - def convert_vae_decoder(vae: torch.nn.Module, ir_path: Path): - """ - Convert VAE model for decoding to IR format. - Function accepts vae model, creates wrapper class for export only necessary for inference part, - prepares example inputs for conversion, - Parameters: - vae (torch.nn.Module): VAE model frm StableDiffusion pipeline - ir_path (Path): File for storing model - Returns: - None - """ - - class VAEDecoderWrapper(torch.nn.Module): - def __init__(self, vae): - super().__init__() - self.vae = vae - - def forward(self, latents): - return self.vae.decode(latents) - - vae_decoder = VAEDecoderWrapper(vae) - latents = torch.zeros((1, 4, 64, 64)) - - vae_decoder.eval() - with torch.no_grad(): - ov_model = ov.convert_model(vae_decoder, example_input=latents, input=[((1, 4, 64, 64),)]) - ov.save_model(ov_model, ir_path) - del ov_model - print(f"VAE decoder successfully converted to IR and saved to {ir_path}") - - - if not VAE_DECODER_OV_PATH.exists(): - convert_vae_decoder(vae, VAE_DECODER_OV_PATH) - else: - print(f"VAE decoder will be loaded from {VAE_DECODER_OV_PATH}") - - del vae - gc.collect(); - -Prepare Inference Pipeline --------------------------- - - - -Putting it all together, let us now take a closer look at how the model -works in inference by illustrating the logical flow. - -.. figure:: https://user-images.githubusercontent.com/29454499/260981188-c112dd0a-5752-4515-adca-8b09bea5d14a.png - :alt: sd-pipeline - - sd-pipeline - -As you can see from the diagram, the only difference between -Text-to-Image and text-guided Image-to-Image generation in approach is -how initial latent state is generated. In case of Image-to-Image -generation, you additionally have an image encoded by VAE encoder mixed -with the noise produced by using latent seed, while in Text-to-Image you -use only noise as initial latent state. The stable diffusion model takes -both a latent image representation of size :math:`64 \times 64` and a -text prompt is transformed to text embeddings of size -:math:`77 \times 768` via CLIP’s text encoder as an input. 
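To make these sizes concrete, the short check below (an illustrative
addition, not required by the pipeline) tokenizes an example prompt the
same way the pipeline does later: the prompt is padded or truncated to
77 token ids, and the converted text encoder maps that ``1 x 77`` tensor
to a ``1 x 77 x 768`` hidden state.

.. code:: ipython3

    from transformers import CLIPTokenizer

    clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    example_tokens = clip_tokenizer(
        "a photo of an astronaut riding a horse",  # example prompt only
        padding="max_length",
        max_length=clip_tokenizer.model_max_length,  # 77 for CLIP ViT-L/14
        truncation=True,
        return_tensors="np",
    )
    print(example_tokens.input_ids.shape)  # (1, 77)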
- -Next, the U-Net iteratively *denoises* the random latent image -representations while being conditioned on the text embeddings. The -output of the U-Net, being the noise residual, is used to compute a -denoised latent image representation via a scheduler algorithm. Many -different scheduler algorithms can be used for this computation, each -having its pros and cons. For Stable Diffusion, it is recommended to use -one of: - -- `PNDM - scheduler `__ -- `DDIM - scheduler `__ -- `K-LMS - scheduler `__\ (you - will use it in your pipeline) - -Theory on how the scheduler algorithm function works is out of scope for -this notebook. Nonetheless, in short, you should remember that you -compute the predicted denoised image representation from the previous -noise representation and the predicted noise residual. For more -information, refer to the recommended `Elucidating the Design Space of -Diffusion-Based Generative Models `__ - -The *denoising* process is repeated given number of times (by default -50) to step-by-step retrieve better latent image representations. When -complete, the latent image representation is decoded by the decoder part -of the variational auto encoder. - -.. code:: ipython3 - - import inspect - from typing import List, Optional, Union, Dict - - import PIL - import cv2 - - from transformers import CLIPTokenizer - from diffusers.pipelines.pipeline_utils import DiffusionPipeline - from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler - - - def scale_fit_to_window(dst_width: int, dst_height: int, image_width: int, image_height: int): - """ - Preprocessing helper function for calculating image size for resize with peserving original aspect ratio - and fitting image to specific window size - - Parameters: - dst_width (int): destination window width - dst_height (int): destination window height - image_width (int): source image width - image_height (int): source image height - Returns: - result_width (int): calculated width for resize - result_height (int): calculated height for resize - """ - im_scale = min(dst_height / image_height, dst_width / image_width) - return int(im_scale * image_width), int(im_scale * image_height) - - - def preprocess(image: PIL.Image.Image): - """ - Image preprocessing function. Takes image in PIL.Image format, resizes it to keep aspect ration and fits to model input window 512x512, - then converts it to np.ndarray and adds padding with zeros on right or bottom side of image (depends from aspect ratio), after that - converts data to float32 data type and change range of values from [0, 255] to [-1, 1], finally, converts data layout from planar NHWC to NCHW. - The function returns preprocessed input tensor and padding size, which can be used in postprocessing. 
- - Parameters: - image (PIL.Image.Image): input image - Returns: - image (np.ndarray): preprocessed image tensor - meta (Dict): dictionary with preprocessing metadata info - """ - src_width, src_height = image.size - dst_width, dst_height = scale_fit_to_window(512, 512, src_width, src_height) - image = np.array(image.resize((dst_width, dst_height), resample=PIL.Image.Resampling.LANCZOS))[None, :] - pad_width = 512 - dst_width - pad_height = 512 - dst_height - pad = ((0, 0), (0, pad_height), (0, pad_width), (0, 0)) - image = np.pad(image, pad, mode="constant") - image = image.astype(np.float32) / 255.0 - image = 2.0 * image - 1.0 - image = image.transpose(0, 3, 1, 2) - return image, {"padding": pad, "src_width": src_width, "src_height": src_height} - - - class OVStableDiffusionPipeline(DiffusionPipeline): - def __init__( - self, - vae_decoder: ov.Model, - text_encoder: ov.Model, - tokenizer: CLIPTokenizer, - unet: ov.Model, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], - vae_encoder: ov.Model = None, - ): - """ - Pipeline for text-to-image generation using Stable Diffusion. - Parameters: - vae (Model): - Variational Auto-Encoder (VAE) Model to decode images to and from latent representations. - text_encoder (Model): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the clip-vit-large-patch14(https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (CLIPTokenizer): - Tokenizer of class CLIPTokenizer(https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet (Model): Conditional U-Net architecture to denoise the encoded image latents. - scheduler (SchedulerMixin): - A scheduler to be used in combination with unet to denoise the encoded image latents. Can be one of - DDIMScheduler, LMSDiscreteScheduler, or PNDMScheduler. - """ - super().__init__() - self.scheduler = scheduler - self.vae_decoder = vae_decoder - self.vae_encoder = vae_encoder - self.text_encoder = text_encoder - self.unet = unet - self._text_encoder_output = text_encoder.output(0) - self._unet_output = unet.output(0) - self._vae_d_output = vae_decoder.output(0) - self._vae_e_output = vae_encoder.output(0) if vae_encoder is not None else None - self.height = 512 - self.width = 512 - self.tokenizer = tokenizer - - def __call__( - self, - prompt: Union[str, List[str]], - image: PIL.Image.Image = None, - num_inference_steps: Optional[int] = 50, - negative_prompt: Union[str, List[str]] = None, - guidance_scale: Optional[float] = 7.5, - eta: Optional[float] = 0.0, - output_type: Optional[str] = "pil", - seed: Optional[int] = None, - strength: float = 1.0, - gif: Optional[bool] = False, - **kwargs, - ): - """ - Function invoked when calling the pipeline for generation. - Parameters: - prompt (str or List[str]): - The prompt or prompts to guide the image generation. - image (PIL.Image.Image, *optional*, None): - Intinal image for generation. - num_inference_steps (int, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - negative_prompt (str or List[str]): - The negative prompt or prompts to guide the image generation. - guidance_scale (float, *optional*, defaults to 7.5): - Guidance scale as defined in Classifier-Free Diffusion Guidance(https://arxiv.org/abs/2207.12598). - guidance_scale is defined as `w` of equation 2. 
- Higher guidance scale encourages to generate images that are closely linked to the text prompt, - usually at the expense of lower image quality. - eta (float, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [DDIMScheduler], will be ignored for others. - output_type (`str`, *optional*, defaults to "pil"): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): PIL.Image.Image or np.array. - seed (int, *optional*, None): - Seed for random generator state initialization. - gif (bool, *optional*, False): - Flag for storing all steps results or not. - Returns: - Dictionary with keys: - sample - the last generated image PIL.Image.Image or np.array - iterations - *optional* (if gif=True) images for all diffusion steps, List of PIL.Image.Image or np.array. - """ - if seed is not None: - np.random.seed(seed) - - img_buffer = [] - do_classifier_free_guidance = guidance_scale > 1.0 - # get prompt text embeddings - text_embeddings = self._encode_prompt( - prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - negative_prompt=negative_prompt, - ) - - # set timesteps - accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) - extra_set_kwargs = {} - if accepts_offset: - extra_set_kwargs["offset"] = 1 - - self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - latent_timestep = timesteps[:1] - - # get the initial random noise unless the user supplied it - latents, meta = self.prepare_latents(image, latent_timestep) - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if you are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet([latent_model_input, t, text_embeddings])[self._unet_output] - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1] - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - torch.from_numpy(noise_pred), - t, - torch.from_numpy(latents), - **extra_step_kwargs, - )["prev_sample"].numpy() - if gif: - image = self.vae_decoder(latents * (1 / 0.18215))[self._vae_d_output] - image = self.postprocess_image(image, meta, output_type) - img_buffer.extend(image) - - # scale and decode the image latents with vae - image = self.vae_decoder(latents * (1 / 0.18215))[self._vae_d_output] - - image = self.postprocess_image(image, meta, output_type) - return {"sample": image, "iterations": img_buffer} - - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int = 1, - do_classifier_free_guidance: bool = True, - negative_prompt: Union[str, List[str]] = None, - ): - """ - Encodes the prompt into text encoder hidden states. - - Parameters: - prompt (str or list(str)): prompt to be encoded - num_images_per_prompt (int): number of images that should be generated per prompt - do_classifier_free_guidance (bool): whether to use classifier free guidance or not - negative_prompt (str or list(str)): negative prompt to be encoded - Returns: - text_embeddings (np.ndarray): text encoder hidden states - """ - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - # tokenize input prompts - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - - text_embeddings = self.text_encoder(text_input_ids)[self._text_encoder_output] - - # duplicate text embeddings for each generation per prompt - if num_images_per_prompt != 1: - bs_embed, seq_len, _ = text_embeddings.shape - text_embeddings = np.tile(text_embeddings, (1, num_images_per_prompt, 1)) - text_embeddings = np.reshape(text_embeddings, (bs_embed * num_images_per_prompt, seq_len, -1)) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_tokens: List[str] - max_length = text_input_ids.shape[-1] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - else: - uncond_tokens = negative_prompt - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - - uncond_embeddings = self.text_encoder(uncond_input.input_ids)[self._text_encoder_output] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = 
uncond_embeddings.shape[1] - uncond_embeddings = np.tile(uncond_embeddings, (1, num_images_per_prompt, 1)) - uncond_embeddings = np.reshape(uncond_embeddings, (batch_size * num_images_per_prompt, seq_len, -1)) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings = np.concatenate([uncond_embeddings, text_embeddings]) - - return text_embeddings - - def prepare_latents(self, image: PIL.Image.Image = None, latent_timestep: torch.Tensor = None): - """ - Function for getting initial latents for starting generation - - Parameters: - image (PIL.Image.Image, *optional*, None): - Input image for generation, if not provided randon noise will be used as starting point - latent_timestep (torch.Tensor, *optional*, None): - Predicted by scheduler initial step for image generation, required for latent image mixing with nosie - Returns: - latents (np.ndarray): - Image encoded in latent space - """ - latents_shape = (1, 4, self.height // 8, self.width // 8) - noise = np.random.randn(*latents_shape).astype(np.float32) - if image is None: - # if you use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas - if isinstance(self.scheduler, LMSDiscreteScheduler): - noise = noise * self.scheduler.sigmas[0].numpy() - return noise, {} - input_image, meta = preprocess(image) - latents = self.vae_encoder(input_image)[self._vae_e_output] * 0.18215 - latents = self.scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy() - return latents, meta - - def postprocess_image(self, image: np.ndarray, meta: Dict, output_type: str = "pil"): - """ - Postprocessing for decoded image. Takes generated image decoded by VAE decoder, unpad it to initila image size (if required), - normalize and convert to [0, 255] pixels range. Optionally, convertes it from np.ndarray to PIL.Image format - - Parameters: - image (np.ndarray): - Generated image - meta (Dict): - Metadata obtained on latents preparing step, can be empty - output_type (str, *optional*, pil): - Output format for result, can be pil or numpy - Returns: - image (List of np.ndarray or PIL.Image.Image): - Postprocessed images - """ - if "padding" in meta: - pad = meta["padding"] - (_, end_h), (_, end_w) = pad[1:3] - h, w = image.shape[2:] - unpad_h = h - end_h - unpad_w = w - end_w - image = image[:, :, :unpad_h, :unpad_w] - image = np.clip(image / 2 + 0.5, 0, 1) - image = np.transpose(image, (0, 2, 3, 1)) - # 9. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - if "src_height" in meta: - orig_height, orig_width = meta["src_height"], meta["src_width"] - image = [img.resize((orig_width, orig_height), PIL.Image.Resampling.LANCZOS) for img in image] - else: - if "src_height" in meta: - orig_height, orig_width = meta["src_height"], meta["src_width"] - image = [cv2.resize(img, (orig_width, orig_width)) for img in image] - return image - - def get_timesteps(self, num_inference_steps: int, strength: float): - """ - Helper function for getting scheduler timesteps for generation - In case of image-to-image generation, it updates number of steps according to strength - - Parameters: - num_inference_steps (int): - number of inference steps for generation - strength (float): - value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. 
- Values that approach 1.0 enable lots of variations but will also produce images that are not semantically consistent with the input. - """ - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] - - return timesteps, num_inference_steps - t_start - -Configure Inference Pipeline ----------------------------- - - - -First, you should create instances of OpenVINO Model. - -.. code:: ipython3 - - core = ov.Core() - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - core = ov.Core() - - text_enc = core.compile_model(TEXT_ENCODER_OV_PATH, device.value) - -Calibrate UNet for GPU inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -On a GPU device a model is executed in FP16 precision. For Tiny-SD UNet -model there known to be accuracy issues caused by this. Therefore, a -special calibration procedure is used to selectively mark some -operations to be executed in full precision. - -.. code:: ipython3 - - import pickle - import requests - import os - - # Fetch `model_upcast_utils` which helps to restore accuracy when inferred on GPU - r = requests.get("https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/model_upcast_utils.py") - with open("model_upcast_utils.py", "w") as f: - f.write(r.text) - - # Fetch an example input for UNet model needed for upcasting calibration process - r = requests.get("https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/pkl/unet_calibration_example_input.pkl") - with open("unet_calibration_example_input.pkl", "wb") as f: - f.write(r.content) - - from model_upcast_utils import ( - is_model_partially_upcasted, - partially_upcast_nodes_to_fp32, - ) - - unet_model = core.read_model(UNET_OV_PATH) - if "GPU" in core.available_devices and not is_model_partially_upcasted(unet_model): - with open("unet_calibration_example_input.pkl", "rb") as f: - example_input = pickle.load(f) - unet_model = partially_upcast_nodes_to_fp32(unet_model, example_input, upcast_ratio=0.7, operation_types=["Convolution"]) - - ov.save_model(unet_model, UNET_OV_PATH.with_suffix("._tmp.xml")) - del unet_model - os.remove(UNET_OV_PATH) - os.remove(str(UNET_OV_PATH).replace(".xml", ".bin")) - UNET_OV_PATH.with_suffix("._tmp.xml").rename(UNET_OV_PATH) - UNET_OV_PATH.with_suffix("._tmp.bin").rename(UNET_OV_PATH.with_suffix(".bin")) - -.. code:: ipython3 - - unet_model = core.compile_model(UNET_OV_PATH, device.value) - -.. code:: ipython3 - - ov_config = {"INFERENCE_PRECISION_HINT": "f32"} if device.value != "CPU" else {} - - vae_decoder = core.compile_model(VAE_DECODER_OV_PATH, device.value, ov_config) - vae_encoder = core.compile_model(VAE_ENCODER_OV_PATH, device.value, ov_config) - -Model tokenizer and scheduler are also important parts of the pipeline. -Let us define them and put all components together - -.. 
code:: ipython3 - - from transformers import CLIPTokenizer - from diffusers.schedulers import LMSDiscreteScheduler - - lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - - ov_pipe = OVStableDiffusionPipeline( - tokenizer=tokenizer, - text_encoder=text_enc, - unet=unet_model, - vae_encoder=vae_encoder, - vae_decoder=vae_decoder, - scheduler=lms, - ) - -Text-to-Image generation -~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Now, let’s see model in action - -.. code:: ipython3 - - text_prompt = "RAW studio photo of An intricate forest minitown landscape trapped in a bottle, atmospheric oliva lighting, on the table, intricate details, dark shot, soothing tones, muted colors " - seed = 431 - num_steps = 20 - -.. code:: ipython3 - - print("Pipeline settings") - print(f"Input text: {text_prompt}") - print(f"Seed: {seed}") - print(f"Number of steps: {num_steps}") - - -.. parsed-literal:: - - Pipeline settings - Input text: RAW studio photo of An intricate forest minitown landscape trapped in a bottle, atmospheric oliva lighting, on the table, intricate details, dark shot, soothing tones, muted colors - Seed: 431 - Number of steps: 20 - - -.. code:: ipython3 - - result = ov_pipe(text_prompt, num_inference_steps=num_steps, seed=seed) - - - -.. parsed-literal:: - - 0%| | 0/20 [00:00`__. - -.. code:: ipython3 - - text_prompt_i2i = "professional photo portrait of woman, highly detailed, hyper realistic, cinematic effects, soft lighting" - negative_prompt_i2i = ( - "blurry, poor quality, low res, worst quality, cropped, ugly, poorly drawn face, without eyes, mutation, unreal, animate, poorly drawn eyes" - ) - num_steps_i2i = 40 - seed_i2i = 82698152 - strength = 0.68 - -.. code:: ipython3 - - from diffusers.utils import load_image - - default_image_url = "https://user-images.githubusercontent.com/29454499/260418860-69cc443a-9ee6-493c-a393-3a97af080be7.jpg" - # read uploaded image - image = load_image(default_image_url) - print("Pipeline settings") - print(f"Input positive prompt: \n\t{text_prompt_i2i}") - print(f"Input negative prompt: \n\t{negative_prompt_i2i}") - print(f"Seed: {seed_i2i}") - print(f"Number of steps: {num_steps_i2i}") - print(f"Strength: {strength}") - print("Input image:") - display(image) - processed_image = ov_pipe( - text_prompt_i2i, - image, - negative_prompt=negative_prompt_i2i, - num_inference_steps=num_steps_i2i, - seed=seed_i2i, - strength=strength, - ) - - -.. parsed-literal:: - - Pipeline settings - Input positive prompt: - professional photo portrait of woman, highly detailed, hyper realistic, cinematic effects, soft lighting - Input negative prompt: - blurry, poor quality, low res, worst quality, cropped, ugly, poorly drawn face, without eyes, mutation, unreal, animate, poorly drawn eyes - Seed: 82698152 - Number of steps: 40 - Strength: 0.68 - Input image: - - - -.. image:: tiny-sd-image-generation-with-output_files/tiny-sd-image-generation-with-output_39_1.png - - - -.. parsed-literal:: - - 0%| | 0/27 [00:00`__: -`vehicle-detection-0200 `__ -for object detection and -`vehicle-attributes-recognition-barrier-0039 `__ -for image classification. Using these models, you will detect vehicles -from raw images and recognize attributes of detected vehicles. -|flowchart| - -As a result, you can get: - -.. 
figure:: https://user-images.githubusercontent.com/47499836/157867020-99738b30-62ca-44e2-8d9e-caf13fb724ed.png - :alt: result - - result - - -**Table of contents:** - - -- `Imports <#imports>`__ -- `Download Models <#download-models>`__ -- `Load Models <#load-models>`__ - - - `Get attributes from model <#get-attributes-from-model>`__ - - `Helper function <#helper-function>`__ - - `Read and display a test image <#read-and-display-a-test-image>`__ - -- `Use the Detection Model to Detect - Vehicles <#use-the-detection-model-to-detect-vehicles>`__ - - - `Detection Processing <#detection-processing>`__ - - `Recognize vehicle attributes <#recognize-vehicle-attributes>`__ - - - `Recognition processing <#recognition-processing>`__ - - - `Combine two models <#combine-two-models>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -.. |flowchart| image:: https://user-images.githubusercontent.com/47499836/157867076-9e997781-f9ef-45f6-9a51-b515bbf41048.png - -Imports -------- - - - -Import the required modules. - -.. code:: ipython3 - - %pip install -q "openvino>=2023.1.0" opencv-python tqdm - - %pip install -q "matplotlib>=3.4" - -.. code:: ipython3 - - import os - from pathlib import Path - from typing import Tuple - - import cv2 - import numpy as np - import matplotlib.pyplot as plt - import openvino as ov - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - import notebook_utils as utils - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("vehicle-detection-and-recognition.ipynb") - -Download Models ---------------- - - - -Download pretrained models from -https://storage.openvinotoolkit.org/repositories/open_model_zoo. If the -model is already downloaded, this step is skipped. - - **Note**: To change the model, replace the name of the model in the - code below, for example to ``"vehicle-detection-0201"`` or - ``"vehicle-detection-0202"``. Keep in mind that they support - different image input sizes in detection. Also, you can change the - recognition model to - ``"vehicle-attributes-recognition-barrier-0042"``. They are trained - from different deep learning frames. Therefore, if you want to change - the precision, you need to modify the precision value in ``"FP32"``, - ``"FP16"``, and ``"FP16-INT8"``. A different type has a different - model size and a precision value. - -.. code:: ipython3 - - # A directory where the model will be downloaded. - base_model_dir = Path("model") - # The name of the model from Open Model Zoo. - detection_model_name = "vehicle-detection-0200" - recognition_model_name = "vehicle-attributes-recognition-barrier-0039" - # Selected precision (FP32, FP16, FP16-INT8) - precision = "FP32" - - base_model_url = "https://storage.openvinotoolkit.org/repositories/open_model_zoo/2023.0/models_bin/1" - - # Check if the model exists. 
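    # The storage location keeps each model as
    # <base_model_url>/<model_name>/<precision>/<model_name>.xml with a matching
    # .bin file next to it; the URLs below are built according to that layout.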
- detection_model_url = f"{base_model_url}/{detection_model_name}/{precision}/{detection_model_name}.xml" - recognition_model_url = f"{base_model_url}/{recognition_model_name}/{precision}/{recognition_model_name}.xml" - detection_model_path = (base_model_dir / detection_model_name).with_suffix(".xml") - recognition_model_path = (base_model_dir / recognition_model_name).with_suffix(".xml") - - # Download the detection model. - if not detection_model_path.exists(): - utils.download_file(detection_model_url, detection_model_name + ".xml", base_model_dir) - utils.download_file( - detection_model_url.replace(".xml", ".bin"), - detection_model_name + ".bin", - base_model_dir, - ) - # Download the recognition model. - if not os.path.exists(recognition_model_path): - utils.download_file(recognition_model_url, recognition_model_name + ".xml", base_model_dir) - utils.download_file( - recognition_model_url.replace(".xml", ".bin"), - recognition_model_name + ".bin", - base_model_dir, - ) - -Load Models ------------ - - - -This tutorial requires a detection model and a recognition model. After -downloading the models, initialize OpenVINO Runtime, and use -``read_model()`` to read network architecture and weights from ``*.xml`` -and ``*.bin`` files. Then, compile it with ``compile_model()`` to the -specified device. - -.. code:: ipython3 - - device = utils.device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - # Initialize OpenVINO Runtime runtime. - core = ov.Core() - - - def model_init(model_path: str) -> Tuple: - """ - Read the network and weights from file, load the - model on the CPU and get input and output names of nodes - - :param: model: model architecture path *.xml - :retuns: - input_key: Input node network - output_key: Output node network - exec_net: Encoder model network - net: Model network - """ - - # Read the network and corresponding weights from a file. - model = core.read_model(model=model_path) - compiled_model = core.compile_model(model=model, device_name=device.value) - # Get input and output names of nodes. - input_keys = compiled_model.input(0) - output_keys = compiled_model.output(0) - return input_keys, output_keys, compiled_model - -Get attributes from model -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Use ``input_keys.shape`` to get data shapes. - -.. code:: ipython3 - - # de -> detection - # re -> recognition - # Detection model initialization. - input_key_de, output_keys_de, compiled_model_de = model_init(detection_model_path) - # Recognition model initialization. - input_key_re, output_keys_re, compiled_model_re = model_init(recognition_model_path) - - # Get input size - Detection. - height_de, width_de = list(input_key_de.shape)[2:] - # Get input size - Recognition. - height_re, width_re = list(input_key_re.shape)[2:] - -Helper function -~~~~~~~~~~~~~~~ - - - -The ``plt_show()`` function is used to show image. - -.. code:: ipython3 - - def plt_show(raw_image): - """ - Use matplot to show image inline - raw_image: input image - - :param: raw_image:image array - """ - plt.figure(figsize=(10, 6)) - plt.axis("off") - plt.imshow(raw_image) - -Read and display a test image -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The input shape of detection model is ``[1, 3, 256, 256]``. Therefore, -you need to resize the image to ``256 x 256``, and expand the batch -channel with ``expand_dims`` function. - -.. code:: ipython3 - - # Load an image. 
- url = "https://storage.openvinotoolkit.org/data/test_data/images/person-bicycle-car-detection.bmp" - filename = "cars.jpg" - directory = "data" - - image_file = Path(directory) / filename - if not image_file.exists(): - utils.download_file( - url, - filename=filename, - directory=directory, - show_progress=False, - ) - - # Read the image. - image_de = cv2.imread("data/cars.jpg") - # Resize it to [3, 256, 256]. - resized_image_de = cv2.resize(image_de, (width_de, height_de)) - # Expand the batch channel to [1, 3, 256, 256]. - input_image_de = np.expand_dims(resized_image_de.transpose(2, 0, 1), 0) - # Show the image. - plt_show(cv2.cvtColor(image_de, cv2.COLOR_BGR2RGB)) - - - -.. image:: vehicle-detection-and-recognition-with-output_files/vehicle-detection-and-recognition-with-output_14_0.png - - -Use the Detection Model to Detect Vehicles ------------------------------------------- - - - -.. figure:: https://user-images.githubusercontent.com/47499836/157867076-9e997781-f9ef-45f6-9a51-b515bbf41048.png - :alt: pipline - - pipline - -As shown in the flowchart, images of individual vehicles are sent to the -recognition model. First, use ``infer`` function to get the result. - -The detection model output has the format -``[image_id, label, conf, x_min, y_min, x_max, y_max]``, where: - -- ``image_id`` - ID of the image in the batch -- ``label`` - predicted class ID (0 - vehicle) -- ``conf`` - confidence for the predicted class -- ``(x_min, y_min)`` - coordinates of the top left bounding box corner -- ``(x_max, y_max)`` - coordinates of the bottom right bounding box - corner - -Delete unused dims and filter out results that are not used. - -.. code:: ipython3 - - # Run inference. - boxes = compiled_model_de([input_image_de])[output_keys_de] - # Delete the dim of 0, 1. - boxes = np.squeeze(boxes, (0, 1)) - # Remove zero only boxes. - boxes = boxes[~np.all(boxes == 0, axis=1)] - -Detection Processing -~~~~~~~~~~~~~~~~~~~~ - - - -With the function below, you change the ratio to the real position in -the image and filter out low-confidence results. - -.. code:: ipython3 - - def crop_images(bgr_image, resized_image, boxes, threshold=0.6) -> np.ndarray: - """ - Use bounding boxes from detection model to find the absolute car position - - :param: bgr_image: raw image - :param: resized_image: resized image - :param: boxes: detection model returns rectangle position - :param: threshold: confidence threshold - :returns: car_position: car's absolute position - """ - # Fetch image shapes to calculate ratio - (real_y, real_x), (resized_y, resized_x) = ( - bgr_image.shape[:2], - resized_image.shape[:2], - ) - ratio_x, ratio_y = real_x / resized_x, real_y / resized_y - - # Find the boxes ratio - boxes = boxes[:, 2:] - # Store the vehicle's position - car_position = [] - # Iterate through non-zero boxes - for box in boxes: - # Pick confidence factor from last place in array - conf = box[0] - if conf > threshold: - # Convert float to int and multiply corner position of each box by x and y ratio - # In case that bounding box is found at the top of the image, - # upper box bar should be positioned a little bit lower to make it visible on image - (x_min, y_min, x_max, y_max) = [ - (int(max(corner_position * ratio_y * resized_y, 10)) if idx % 2 else int(corner_position * ratio_x * resized_x)) - for idx, corner_position in enumerate(box[1:]) - ] - - car_position.append([x_min, y_min, x_max, y_max]) - - return car_position - -.. code:: ipython3 - - # Find the position of a car. 
- car_position = crop_images(image_de, resized_image_de, boxes) - -Recognize vehicle attributes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Select one of the detected boxes. Then, crop to an area containing a -vehicle to test with the recognition model. Again, you need to resize -the input image and run inference. - -.. code:: ipython3 - - # Select a vehicle to recognize. - pos = car_position[0] - # Crop the image with [y_min:y_max, x_min:x_max]. - test_car = image_de[pos[1] : pos[3], pos[0] : pos[2]] - # Resize the image to input_size. - resized_image_re = cv2.resize(test_car, (width_re, height_re)) - input_image_re = np.expand_dims(resized_image_re.transpose(2, 0, 1), 0) - plt_show(cv2.cvtColor(resized_image_re, cv2.COLOR_BGR2RGB)) - - - -.. image:: vehicle-detection-and-recognition-with-output_files/vehicle-detection-and-recognition-with-output_21_0.png - - -Recognition processing -'''''''''''''''''''''' - - - -The result contains colors of the vehicles (white, gray, yellow, red, -green, blue, black) and types of vehicles (car, bus, truck, van). Next, -you need to calculate the probability of each attribute. Then, you -determine the maximum probability as the result. - -.. code:: ipython3 - - def vehicle_recognition(compiled_model_re, input_size, raw_image): - """ - Vehicle attributes recognition, input a single vehicle, return attributes - :param: compiled_model_re: recognition net - :param: input_size: recognition input size - :param: raw_image: single vehicle image - :returns: attr_color: predicted color - attr_type: predicted type - """ - # An attribute of a vehicle. - colors = ["White", "Gray", "Yellow", "Red", "Green", "Blue", "Black"] - types = ["Car", "Bus", "Truck", "Van"] - - # Resize the image to input size. - resized_image_re = cv2.resize(raw_image, input_size) - input_image_re = np.expand_dims(resized_image_re.transpose(2, 0, 1), 0) - - # Run inference. - # Predict result. - predict_colors = compiled_model_re([input_image_re])[compiled_model_re.output(1)] - # Delete the dim of 2, 3. - predict_colors = np.squeeze(predict_colors, (2, 3)) - predict_types = compiled_model_re([input_image_re])[compiled_model_re.output(0)] - predict_types = np.squeeze(predict_types, (2, 3)) - - attr_color, attr_type = ( - colors[np.argmax(predict_colors)], - types[np.argmax(predict_types)], - ) - return attr_color, attr_type - -.. code:: ipython3 - - print(f"Attributes:{vehicle_recognition(compiled_model_re, (72, 72), test_car)}") - - -.. parsed-literal:: - - Attributes:('Gray', 'Car') - - -Combine two models -~~~~~~~~~~~~~~~~~~ - - - -Congratulations! You successfully used a detection model to crop an -image with a vehicle and recognize the attributes of a vehicle. - -.. code:: ipython3 - - def convert_result_to_image(compiled_model_re, bgr_image, resized_image, boxes, threshold=0.6): - """ - Use Detection model boxes to draw rectangles and plot the result - - :param: compiled_model_re: recognition net - :param: input_key_re: recognition input key - :param: bgr_image: raw image - :param: resized_image: resized image - :param: boxes: detection model returns rectangle position - :param: threshold: confidence threshold - :returns: rgb_image: processed image - """ - # Define colors for boxes and descriptions. - colors = {"red": (255, 0, 0), "green": (0, 255, 0)} - - # Convert the base image from BGR to RGB format. - rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB) - - # Find positions of cars. 
- car_position = crop_images(image_de, resized_image, boxes) - - for x_min, y_min, x_max, y_max in car_position: - # Run vehicle recognition inference. - attr_color, attr_type = vehicle_recognition(compiled_model_re, (72, 72), image_de[y_min:y_max, x_min:x_max]) - - # Close the window with a vehicle. - plt.close() - - # Draw a bounding box based on position. - # Parameters in the `rectangle` function are: image, start_point, end_point, color, thickness. - rgb_image = cv2.rectangle(rgb_image, (x_min, y_min), (x_max, y_max), colors["red"], 2) - - # Print the attributes of a vehicle. - # Parameters in the `putText` function are: img, text, org, fontFace, fontScale, color, thickness, lineType. - rgb_image = cv2.putText( - rgb_image, - f"{attr_color} {attr_type}", - (x_min, y_min - 10), - cv2.FONT_HERSHEY_SIMPLEX, - 2, - colors["green"], - 10, - cv2.LINE_AA, - ) - - return rgb_image - -.. code:: ipython3 - - plt_show(convert_result_to_image(compiled_model_re, image_de, resized_image_de, boxes)) - - - -.. image:: vehicle-detection-and-recognition-with-output_files/vehicle-detection-and-recognition-with-output_27_0.png - diff --git a/docs/notebooks/vehicle-detection-and-recognition-with-output_files/vehicle-detection-and-recognition-with-output_14_0.png b/docs/notebooks/vehicle-detection-and-recognition-with-output_files/vehicle-detection-and-recognition-with-output_14_0.png deleted file mode 100644 index 0b217b67f4f27c..00000000000000 --- a/docs/notebooks/vehicle-detection-and-recognition-with-output_files/vehicle-detection-and-recognition-with-output_14_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bdc1690871309f5df1aa9a81ec40df1a119148cabaeeb8191865a0ddc476c8bb -size 172680 diff --git a/docs/notebooks/vehicle-detection-and-recognition-with-output_files/vehicle-detection-and-recognition-with-output_21_0.png b/docs/notebooks/vehicle-detection-and-recognition-with-output_files/vehicle-detection-and-recognition-with-output_21_0.png deleted file mode 100644 index e18fb7780e0c05..00000000000000 --- a/docs/notebooks/vehicle-detection-and-recognition-with-output_files/vehicle-detection-and-recognition-with-output_21_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2006105efd929fe1b58f665276da47a6269861cfbf835cfaac1b4e72e8b1573b -size 19599 diff --git a/docs/notebooks/vehicle-detection-and-recognition-with-output_files/vehicle-detection-and-recognition-with-output_27_0.png b/docs/notebooks/vehicle-detection-and-recognition-with-output_files/vehicle-detection-and-recognition-with-output_27_0.png deleted file mode 100644 index d330362dc3dcb8..00000000000000 --- a/docs/notebooks/vehicle-detection-and-recognition-with-output_files/vehicle-detection-and-recognition-with-output_27_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:399e2d167f88c78b9fd3cf1cfe1ca6c424b1f6de980a21188e676448b5a6f4f6 -size 175941 diff --git a/docs/notebooks/vision-background-removal-with-output.rst b/docs/notebooks/vision-background-removal-with-output.rst deleted file mode 100644 index 1d489a957441c8..00000000000000 --- a/docs/notebooks/vision-background-removal-with-output.rst +++ /dev/null @@ -1,436 +0,0 @@ -Image Background Removal with U^2-Net and OpenVINO™ -=================================================== - -This notebook demonstrates background removal in images using -U^2-Net and OpenVINO. 
- -For more information about U^2-Net, including source code and -test data, see the `GitHub -page `__ and the research paper: -`U^2-Net: Going Deeper with Nested U-Structure for Salient Object -Detection `__. - -The PyTorch U^2-Net model is converted to OpenVINO IR format. -The model source is available -`here `__. - - -**Table of contents:** - - -- `Preparation <#preparation>`__ - - - `Install requirements <#install-requirements>`__ - - `Import the PyTorch Library and - U^2-Net <#import-the-pytorch-library-and-u2-net>`__ - - `Settings <#settings>`__ - - `Load the U^2-Net Model <#load-the-u2-net-model>`__ - -- `Convert PyTorch U^2-Net model to OpenVINO - IR <#convert-pytorch-u2-net-model-to-openvino-ir>`__ -- `Load and Pre-Process Input - Image <#load-and-pre-process-input-image>`__ -- `Select inference device <#select-inference-device>`__ -- `Do Inference on OpenVINO IR - Model <#do-inference-on-openvino-ir-model>`__ -- `Visualize Results <#visualize-results>`__ - - - `Add a Background Image <#add-a-background-image>`__ - -- `References <#references>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Preparation ------------ - - - -Install requirements -~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %pip install -q "openvino>=2023.1.0" - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "torch>=2.1" opencv-python - %pip install -q "gdown<4.6.4" - - %pip install -q "matplotlib>=3.4" - -Import the PyTorch Library and U^2-Net -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - import os - import time - from collections import namedtuple - from pathlib import Path - - import cv2 - import matplotlib.pyplot as plt - import numpy as np - import openvino as ov - import torch - from IPython.display import HTML, FileLink, display - -.. code:: ipython3 - - # Import local modules - import requests - - if not Path("./notebook_utils.py").exists(): - # Fetch `notebook_utils` module - - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import load_image, download_file, device_widget - - if not Path("./model/u2net.py").exists(): - download_file( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/vision-background-removal/model/u2net.py", directory="model" - ) - from model.u2net import U2NET, U2NETP - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("vision-background-removal.ipynb") - -Settings -~~~~~~~~ - - - -This tutorial supports using the original U^2-Net salient -object detection model, as well as the smaller U2NETP version. Two sets -of weights are supported for the original model: salient object -detection and human segmentation. - -.. 
code:: ipython3 - - model_config = namedtuple("ModelConfig", ["name", "url", "model", "model_args"]) - - u2net_lite = model_config( - name="u2net_lite", - url="https://drive.google.com/uc?id=1W8E4FHIlTVstfRkYmNOjbr0VDXTZm0jD", - model=U2NETP, - model_args=(), - ) - u2net = model_config( - name="u2net", - url="https://drive.google.com/uc?id=1ao1ovG1Qtx4b7EoskHXmi2E9rp5CHLcZ", - model=U2NET, - model_args=(3, 1), - ) - u2net_human_seg = model_config( - name="u2net_human_seg", - url="https://drive.google.com/uc?id=1m_Kgs91b21gayc2XLW0ou8yugAIadWVP", - model=U2NET, - model_args=(3, 1), - ) - - # Set u2net_model to one of the three configurations listed above. - u2net_model = u2net_lite - -.. code:: ipython3 - - # The filenames of the downloaded and converted models. - MODEL_DIR = "model" - model_path = Path(MODEL_DIR) / u2net_model.name / Path(u2net_model.name).with_suffix(".pth") - -Load the U^2-Net Model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The U^2-Net human segmentation model weights are stored on -Google Drive. They will be downloaded if they are not present yet. The -next cell loads the model and the pre-trained weights. - -.. code:: ipython3 - - if not model_path.exists(): - import gdown - - os.makedirs(name=model_path.parent, exist_ok=True) - print("Start downloading model weights file... ") - with open(model_path, "wb") as model_file: - gdown.download(url=u2net_model.url, output=model_file) - print(f"Model weights have been downloaded to {model_path}") - - -.. parsed-literal:: - - Start downloading model weights file... - - -.. parsed-literal:: - - Downloading... - From: https://drive.google.com/uc?id=1rbSTGKAE-MTxBYHd-51l2hMOQPT_7EPy - To: <_io.BufferedWriter name='model/u2net_lite/u2net_lite.pth'> - 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.68M/4.68M [00:01<00:00, 4.19MB/s] - -.. parsed-literal:: - - Model weights have been downloaded to model/u2net_lite/u2net_lite.pth - - - - - - - -.. code:: ipython3 - - # Load the model. - net = u2net_model.model(*u2net_model.model_args) - net.eval() - - # Load the weights. - print(f"Loading model weights from: '{model_path}'") - net.load_state_dict(state_dict=torch.load(model_path, map_location="cpu")) - - -.. parsed-literal:: - - Loading model weights from: 'model/u2net_lite/u2net_lite.pth' - - - - -.. parsed-literal:: - - - - - -Convert PyTorch U^2-Net model to OpenVINO IR ------------------------------------------------------- - - - -We use model conversion Python API to convert the Pytorch model to -OpenVINO IR format. Executing the following command may take a while. - -.. code:: ipython3 - - model_ir = ov.convert_model(net, example_input=torch.zeros((1, 3, 512, 512)), input=([1, 3, 512, 512])) - -Load and Pre-Process Input Image --------------------------------- - - - -While OpenCV reads images in ``BGR`` format, the OpenVINO IR model -expects images in ``RGB``. Therefore, convert the images to ``RGB``, -resize them to ``512 x 512``, and transpose the dimensions to the format -the OpenVINO IR model expects. - -We add the mean values to the image tensor and scale the input with the -standard deviation. It is called the input data normalization before -propagating it through the network. 
The mean and standard deviation -values can be found in the -`dataloader `__ -file in the `U^2-Net -repository `__ and multiplied by -255 to support images with pixel values from 0-255. - -.. code:: ipython3 - - IMAGE_URI = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_hollywood.jpg" - IMAGE_NAME = "coco_hollywood.jpg" - - input_mean = np.array([123.675, 116.28, 103.53]).reshape(1, 3, 1, 1) - input_scale = np.array([58.395, 57.12, 57.375]).reshape(1, 3, 1, 1) - - image = cv2.cvtColor( - src=load_image(IMAGE_NAME, IMAGE_URI), - code=cv2.COLOR_BGR2RGB, - ) - - resized_image = cv2.resize(src=image, dsize=(512, 512)) - # Convert the image shape to a shape and a data type expected by the network - # for OpenVINO IR model: (1, 3, 512, 512). - input_image = np.expand_dims(np.transpose(resized_image, (2, 0, 1)), 0) - - input_image = (input_image - input_mean) / input_scale - -Select inference device ------------------------ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO') - - - -Do Inference on OpenVINO IR Model ---------------------------------- - - - -Load the OpenVINO IR model to OpenVINO Runtime and do inference. - -.. code:: ipython3 - - core = ov.Core() - # Load the network to OpenVINO Runtime. - compiled_model_ir = core.compile_model(model=model_ir, device_name=device.value) - # Get the names of input and output layers. - input_layer_ir = compiled_model_ir.input(0) - output_layer_ir = compiled_model_ir.output(0) - - # Do inference on the input image. - start_time = time.perf_counter() - result = compiled_model_ir([input_image])[output_layer_ir] - end_time = time.perf_counter() - print(f"Inference finished. Inference time: {end_time-start_time:.3f} seconds, " f"FPS: {1/(end_time-start_time):.2f}.") - - -.. parsed-literal:: - - Inference finished. Inference time: 0.097 seconds, FPS: 10.33. - - -Visualize Results ------------------ - - - -Show the original image, the segmentation result, and the original image -with the background removed. - -.. code:: ipython3 - - # Resize the network result to the image shape and round the values - # to 0 (background) and 1 (foreground). - # The network result has (1,1,512,512) shape. The `np.squeeze` function converts this to (512, 512). - resized_result = np.rint(cv2.resize(src=np.squeeze(result), dsize=(image.shape[1], image.shape[0]))).astype(np.uint8) - - # Create a copy of the image and set all background values to 255 (white). - bg_removed_result = image.copy() - bg_removed_result[resized_result == 0] = 255 - - fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 7)) - ax[0].imshow(image) - ax[1].imshow(resized_result, cmap="gray") - ax[2].imshow(bg_removed_result) - for a in ax: - a.axis("off") - - - -.. image:: vision-background-removal-with-output_files/vision-background-removal-with-output_22_0.png - - -Add a Background Image -~~~~~~~~~~~~~~~~~~~~~~ - - - -In the segmentation result, all foreground pixels have a value of 1, all -background pixels a value of 0. Replace the background image as follows: - -- Load a new ``background_image``. -- Resize the image to the same size as the original image. -- In ``background_image``, set all the pixels, where the resized - segmentation result has a value of 1 - the foreground pixels in the - original image - to 0. 
-- Add ``bg_removed_result`` from the previous step - the part of the - original image that only contains foreground pixels - to - ``background_image``. - -.. code:: ipython3 - - BACKGROUND_IMAGE_URL = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/wall.jpg" - BACKGROUND_IMAGE_NAME = "wall.jpg" - OUTPUT_DIR = "output" - - os.makedirs(name=OUTPUT_DIR, exist_ok=True) - - background_image = cv2.cvtColor(src=load_image(BACKGROUND_IMAGE_NAME, BACKGROUND_IMAGE_URL), code=cv2.COLOR_BGR2RGB) - background_image = cv2.resize(src=background_image, dsize=(image.shape[1], image.shape[0])) - - # Set all the foreground pixels from the result to 0 - # in the background image and add the image with the background removed. - background_image[resized_result == 1] = 0 - new_image = background_image + bg_removed_result - - # Save the generated image. - new_image_path = Path(f"{OUTPUT_DIR}/{Path(IMAGE_URI).stem}-{BACKGROUND_IMAGE_NAME}") - cv2.imwrite(filename=str(new_image_path), img=cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)) - - # Display the original image and the image with the new background side by side - fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(18, 7)) - ax[0].imshow(image) - ax[1].imshow(new_image) - for a in ax: - a.axis("off") - plt.show() - - # Create a link to download the image. - image_link = FileLink(new_image_path) - image_link.html_link_str = "%s" - display( - HTML( - f"The generated image {new_image_path.name} is saved in " - f"the directory {new_image_path.parent}. You can also " - "download the image by clicking on this link: " - f"{image_link._repr_html_()}" - ) - ) - - - -.. image:: vision-background-removal-with-output_files/vision-background-removal-with-output_24_0.png - - - -.. raw:: html - - The generated image coco_hollywood-wall.jpg is saved in the directory output. You can also download the image by clicking on this link: output/coco_hollywood-wall.jpg
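As a recap, the helper below is an optional sketch that reuses objects
created in the cells above (``compiled_model_ir``, ``output_layer_ir``,
``input_mean`` and ``input_scale``). It bundles the preprocessing,
inference, and thresholding steps into a single function, so the same
compiled model can be applied to any other RGB image.

.. code:: ipython3

    def remove_background(image_rgb: np.ndarray) -> np.ndarray:
        """Return a copy of `image_rgb` with the background replaced by white."""
        # Preprocess: resize to the 512 x 512 network input and normalize.
        resized = cv2.resize(src=image_rgb, dsize=(512, 512))
        input_tensor = np.expand_dims(np.transpose(resized, (2, 0, 1)), 0)
        input_tensor = (input_tensor - input_mean) / input_scale

        # Inference: the network returns a (1, 1, 512, 512) saliency map.
        mask = compiled_model_ir([input_tensor])[output_layer_ir]

        # Postprocess: resize the mask back and round it to a binary foreground mask.
        mask = np.rint(cv2.resize(np.squeeze(mask), (image_rgb.shape[1], image_rgb.shape[0]))).astype(np.uint8)

        result = image_rgb.copy()
        result[mask == 0] = 255
        return result

    # Example: apply the helper to the image loaded earlier in this notebook.
    plt.figure(figsize=(7, 7))
    plt.axis("off")
    plt.imshow(remove_background(image))
    plt.show()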
- - -References ----------- - - - -- `PIP install openvino `__ -- `Model Conversion - API `__ -- `U^2-Net `__ -- U^2-Net research paper: `U^2-Net: Going Deeper with Nested - U-Structure for Salient Object - Detection `__ diff --git a/docs/notebooks/vision-background-removal-with-output_files/vision-background-removal-with-output_22_0.png b/docs/notebooks/vision-background-removal-with-output_files/vision-background-removal-with-output_22_0.png deleted file mode 100644 index 4286b7348b3c31..00000000000000 --- a/docs/notebooks/vision-background-removal-with-output_files/vision-background-removal-with-output_22_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b0da387693da2d280d61ff14733dea210ee830e4401412d15d86a0cd4fe6956a -size 279572 diff --git a/docs/notebooks/vision-background-removal-with-output_files/vision-background-removal-with-output_24_0.png b/docs/notebooks/vision-background-removal-with-output_files/vision-background-removal-with-output_24_0.png deleted file mode 100644 index 664b1d700edf44..00000000000000 --- a/docs/notebooks/vision-background-removal-with-output_files/vision-background-removal-with-output_24_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cf1bdcb14ab146b026f0cb489ca98ae0ac0e74c1c4be09d2c4e0d255c8ec960e -size 927043 diff --git a/docs/notebooks/vision-monodepth-with-output.rst b/docs/notebooks/vision-monodepth-with-output.rst deleted file mode 100644 index ea081a4c54b771..00000000000000 --- a/docs/notebooks/vision-monodepth-with-output.rst +++ /dev/null @@ -1,537 +0,0 @@ -Monodepth Estimation with OpenVINO -================================== - -This tutorial demonstrates Monocular Depth Estimation with MidasNet in -OpenVINO. Model information can be found -`here `__. - -.. figure:: https://user-images.githubusercontent.com/36741649/127173017-a0bbcf75-db24-4d2c-81b9-616e04ab7cd9.gif - :alt: monodepth - - monodepth - -What is Monodepth? -~~~~~~~~~~~~~~~~~~ - -Monocular Depth Estimation is the task of estimating scene depth using a -single image. It has many potential applications in robotics, 3D -reconstruction, medical imaging and autonomous systems. This tutorial -uses a neural network model called -`MiDaS `__, which was developed by -the `Embodied AI Foundation `__. -See the research paper below to learn more. - -R. Ranftl, K. Lasinger, D. Hafner, K. Schindler and V. Koltun, `“Towards -Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot -Cross-dataset -Transfer,” `__ in IEEE -Transactions on Pattern Analysis and Machine Intelligence, doi: -``10.1109/TPAMI.2020.3019967``. 
- - -**Table of contents:** - - -- `Preparation <#preparation>`__ - - - `Install requirements <#install-requirements>`__ - - `Imports <#imports>`__ - - `Download the model <#download-the-model>`__ - -- `Functions <#functions>`__ -- `Select inference device <#select-inference-device>`__ -- `Load the Model <#load-the-model>`__ -- `Monodepth on Image <#monodepth-on-image>`__ - - - `Load, resize and reshape input - image <#load-resize-and-reshape-input-image>`__ - - `Do inference on the image <#do-inference-on-the-image>`__ - - `Display monodepth image <#display-monodepth-image>`__ - -- `Monodepth on Video <#monodepth-on-video>`__ - - - `Video Settings <#video-settings>`__ - - `Load the Video <#load-the-video>`__ - - `Do Inference on a Video and Create Monodepth - Video <#do-inference-on-a-video-and-create-monodepth-video>`__ - - `Display Monodepth Video <#display-monodepth-video>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Preparation ------------ - - - -Install requirements -~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %pip install -q "openvino>=2023.1.0" - %pip install -q opencv-python requests tqdm - - %pip install -q "matplotlib>=3.4" - - # Fetch `notebook_utils` module - import requests - from pathlib import Path - - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("vision-monodepth.ipynb") - -Imports -~~~~~~~ - - - -.. code:: ipython3 - - import time - - import cv2 - import matplotlib.cm - import matplotlib.pyplot as plt - import numpy as np - from IPython.display import ( - HTML, - FileLink, - Pretty, - ProgressBar, - Video, - clear_output, - display, - ) - import openvino as ov - - from notebook_utils import download_file, load_image, device_widget - -Download the model -~~~~~~~~~~~~~~~~~~ - - - -The model is in the `OpenVINO Intermediate Representation -(IR) `__ -format. - -.. code:: ipython3 - - model_folder = Path("model") - - ir_model_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/depth-estimation-midas/FP32/" - ir_model_name_xml = "MiDaS_small.xml" - ir_model_name_bin = "MiDaS_small.bin" - - download_file(ir_model_url + ir_model_name_xml, filename=ir_model_name_xml, directory=model_folder) - download_file(ir_model_url + ir_model_name_bin, filename=ir_model_name_bin, directory=model_folder) - - model_xml_path = model_folder / ir_model_name_xml - - -.. parsed-literal:: - - 'model/MiDaS_small.xml' already exists. - 'model/MiDaS_small.bin' already exists. - - -Functions ---------- - - - -.. code:: ipython3 - - def normalize_minmax(data): - """Normalizes the values in `data` between 0 and 1""" - return (data - data.min()) / (data.max() - data.min()) - - - def convert_result_to_image(result, colormap="viridis"): - """ - Convert network result of floating point numbers to an RGB image with - integer values from 0-255 by applying a colormap. 
- - `result` is expected to be a single network result in 1,H,W shape - `colormap` is a matplotlib colormap. - See https://matplotlib.org/stable/tutorials/colors/colormaps.html - """ - cmap = matplotlib.cm.get_cmap(colormap) - result = result.squeeze(0) - result = normalize_minmax(result) - result = cmap(result)[:, :, :3] * 255 - result = result.astype(np.uint8) - return result - - - def to_rgb(image_data) -> np.ndarray: - """ - Convert image_data from BGR to RGB - """ - return cv2.cvtColor(image_data, cv2.COLOR_BGR2RGB) - -Select inference device ------------------------ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO') - - - -Load the Model --------------- - - - -Load the model in OpenVINO Runtime with ``core.read_model`` and compile -it for the specified device with ``core.compile_model``. Get input and -output keys and the expected input shape for the model. - -.. code:: ipython3 - - import openvino.properties as props - - - # Create cache folder - cache_folder = Path("cache") - cache_folder.mkdir(exist_ok=True) - - core = ov.Core() - core.set_property({props.cache_dir(): cache_folder}) - model = core.read_model(model_xml_path) - compiled_model = core.compile_model(model=model, device_name=device.value) - - input_key = compiled_model.input(0) - output_key = compiled_model.output(0) - - network_input_shape = list(input_key.shape) - network_image_height, network_image_width = network_input_shape[2:] - -Monodepth on Image ------------------- - - - -Load, resize and reshape input image -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The input image is read with OpenCV, resized to network input size, and -reshaped to (N,C,H,W) (N=number of images, C=number of channels, -H=height, W=width). - -.. code:: ipython3 - - IMAGE_URL = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bike.jpg" - IMAGE_NAME = "coco_bike.jpg" - - - image = load_image(IMAGE_NAME, IMAGE_URL) - - - # Resize to input shape for network. - resized_image = cv2.resize(src=image, dsize=(network_image_height, network_image_width)) - - # Reshape the image to network input shape NCHW. - input_image = np.expand_dims(np.transpose(resized_image, (2, 0, 1)), 0) - -Do inference on the image -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Do inference, convert the result to an image, and resize it to the -original image shape. - -.. code:: ipython3 - - result = compiled_model([input_image])[output_key] - - # Convert the network result of disparity map to an image that shows - # distance as colors. - result_image = convert_result_to_image(result=result) - - # Resize back to original image shape. The `cv2.resize` function expects shape - # in (width, height), [::-1] reverses the (height, width) shape to match this. - result_image = cv2.resize(result_image, image.shape[:2][::-1]) - -Display monodepth image -~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - fig, ax = plt.subplots(1, 2, figsize=(20, 15)) - ax[0].imshow(to_rgb(image)) - ax[1].imshow(result_image); - - - -.. image:: vision-monodepth-with-output_files/vision-monodepth-with-output_18_0.png - - -Monodepth on Video ------------------- - - - -By default, only the first 4 seconds are processed in order to quickly -check that everything works. Change ``NUM_SECONDS`` in the cell below to -modify this. 
Set ``NUM_SECONDS`` to 0 to process the whole video. - -Video Settings -~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Video source: https://www.youtube.com/watch?v=fu1xcQdJRws (Public Domain) - VIDEO_FILE = "Coco-Walking-in-Berkeley.mp4" - download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Coco%20Walking%20in%20Berkeley.mp4", - VIDEO_FILE, - ) - - # Number of seconds of input video to process. Set `NUM_SECONDS` to 0 to process - # the full video. - NUM_SECONDS = 4 - # Set `ADVANCE_FRAMES` to 1 to process every frame from the input video - # Set `ADVANCE_FRAMES` to 2 to process every second frame. This reduces - # the time it takes to process the video. - ADVANCE_FRAMES = 2 - # Set `SCALE_OUTPUT` to reduce the size of the result video - # If `SCALE_OUTPUT` is 0.5, the width and height of the result video - # will be half the width and height of the input video. - SCALE_OUTPUT = 0.5 - # The format to use for video encoding. The 'vp09` is slow, - # but it works on most systems. - # Try the `THEO` encoding if you have FFMPEG installed. - # FOURCC = cv2.VideoWriter_fourcc(*"THEO") - FOURCC = cv2.VideoWriter_fourcc(*"vp09") - - # Create Path objects for the input video and the result video. - output_directory = Path("output") - output_directory.mkdir(exist_ok=True) - result_video_path = output_directory / f"{Path(VIDEO_FILE).stem}_monodepth.mp4" - -Load the Video -~~~~~~~~~~~~~~ - - - -Load the video from a ``VIDEO_FILE``, set in the *Video Settings* cell -above. Open the video to read the frame width and height and fps, and -compute values for these properties for the monodepth video. - -.. code:: ipython3 - - cap = cv2.VideoCapture(str(VIDEO_FILE)) - ret, image = cap.read() - if not ret: - raise ValueError(f"The video at {VIDEO_FILE} cannot be read.") - input_fps = cap.get(cv2.CAP_PROP_FPS) - input_video_frame_height, input_video_frame_width = image.shape[:2] - - target_fps = input_fps / ADVANCE_FRAMES - target_frame_height = int(input_video_frame_height * SCALE_OUTPUT) - target_frame_width = int(input_video_frame_width * SCALE_OUTPUT) - - cap.release() - print(f"The input video has a frame width of {input_video_frame_width}, " f"frame height of {input_video_frame_height} and runs at {input_fps:.2f} fps") - print( - "The monodepth video will be scaled with a factor " - f"{SCALE_OUTPUT}, have width {target_frame_width}, " - f" height {target_frame_height}, and run at {target_fps:.2f} fps" - ) - - -.. parsed-literal:: - - The input video has a frame width of 640, frame height of 360 and runs at 30.00 fps - The monodepth video will be scaled with a factor 0.5, have width 320, height 180, and run at 15.00 fps - - -Do Inference on a Video and Create Monodepth Video -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # Initialize variables. - input_video_frame_nr = 0 - start_time = time.perf_counter() - total_inference_duration = 0 - - # Open the input video - cap = cv2.VideoCapture(str(VIDEO_FILE)) - - # Create a result video. 
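    # The writer's frame size is (target_frame_width * 2, target_frame_height)
    # because each output frame holds the input frame and its colorized depth
    # map side by side (they are combined with np.hstack further below).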
- out_video = cv2.VideoWriter( - str(result_video_path), - FOURCC, - target_fps, - (target_frame_width * 2, target_frame_height), - ) - - num_frames = int(NUM_SECONDS * input_fps) - total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) if num_frames == 0 else num_frames - progress_bar = ProgressBar(total=total_frames) - progress_bar.display() - - try: - while cap.isOpened(): - ret, image = cap.read() - if not ret: - cap.release() - break - - if input_video_frame_nr >= total_frames: - break - - # Only process every second frame. - # Prepare a frame for inference. - # Resize to the input shape for network. - resized_image = cv2.resize(src=image, dsize=(network_image_height, network_image_width)) - # Reshape the image to network input shape NCHW. - input_image = np.expand_dims(np.transpose(resized_image, (2, 0, 1)), 0) - - # Do inference. - inference_start_time = time.perf_counter() - result = compiled_model([input_image])[output_key] - inference_stop_time = time.perf_counter() - inference_duration = inference_stop_time - inference_start_time - total_inference_duration += inference_duration - - if input_video_frame_nr % (10 * ADVANCE_FRAMES) == 0: - clear_output(wait=True) - progress_bar.display() - # input_video_frame_nr // ADVANCE_FRAMES gives the number of - # Frames that have been processed by the network. - display( - Pretty( - f"Processed frame {input_video_frame_nr // ADVANCE_FRAMES}" - f"/{total_frames // ADVANCE_FRAMES}. " - f"Inference time per frame: {inference_duration:.2f} seconds " - f"({1/inference_duration:.2f} FPS)" - ) - ) - - # Transform the network result to a RGB image. - result_frame = to_rgb(convert_result_to_image(result)) - # Resize the image and the result to a target frame shape. - result_frame = cv2.resize(result_frame, (target_frame_width, target_frame_height)) - image = cv2.resize(image, (target_frame_width, target_frame_height)) - # Put the image and the result side by side. - stacked_frame = np.hstack((image, result_frame)) - # Save a frame to the video. - out_video.write(stacked_frame) - - input_video_frame_nr = input_video_frame_nr + ADVANCE_FRAMES - cap.set(1, input_video_frame_nr) - - progress_bar.progress = input_video_frame_nr - progress_bar.update() - - except KeyboardInterrupt: - print("Processing interrupted.") - finally: - clear_output() - processed_frames = num_frames // ADVANCE_FRAMES - out_video.release() - cap.release() - end_time = time.perf_counter() - duration = end_time - start_time - - print( - f"Processed {processed_frames} frames in {duration:.2f} seconds. " - f"Total FPS (including video processing): {processed_frames/duration:.2f}." - f"Inference FPS: {processed_frames/total_inference_duration:.2f} " - ) - print(f"Monodepth Video saved to '{str(result_video_path)}'.") - - -.. parsed-literal:: - - Processed 60 frames in 97.70 seconds. Total FPS (including video processing): 0.61.Inference FPS: 82.48 - Monodepth Video saved to 'output/Coco%20Walking%20in%20Berkeley_monodepth.mp4'. - - -Display Monodepth Video -~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - video = Video(result_video_path, width=800, embed=True) - if not result_video_path.exists(): - plt.imshow(stacked_frame) - raise ValueError("OpenCV was unable to write the video file. 
Showing one video frame.") - else: - print(f"Showing monodepth video saved at\n{result_video_path.resolve()}") - print("If you cannot see the video in your browser, please click on the " "following link to download the video ") - video_link = FileLink(result_video_path) - video_link.html_link_str = "%s" - display(HTML(video_link._repr_html_())) - display(video) - - -.. parsed-literal:: - - Showing monodepth video saved at - /home/ea/work/openvino_notebooks/notebooks/vision-monodepth/output/Coco%20Walking%20in%20Berkeley_monodepth.mp4 - If you cannot see the video in your browser, please click on the following link to download the video - - - -.. raw:: html - - output/Coco%20Walking%20in%20Berkeley_monodepth.mp4
- - - -.. raw:: html - - - diff --git a/docs/notebooks/vision-monodepth-with-output_files/vision-monodepth-with-output_18_0.png b/docs/notebooks/vision-monodepth-with-output_files/vision-monodepth-with-output_18_0.png deleted file mode 100644 index f10d1b29df755c..00000000000000 --- a/docs/notebooks/vision-monodepth-with-output_files/vision-monodepth-with-output_18_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9c5b82e40b1bf5b8f0dbc2c4ffabd21207bbc1b59c558aace759c3ab5ca6787f -size 959858 diff --git a/docs/notebooks/wav2lip-with-output.rst b/docs/notebooks/wav2lip-with-output.rst deleted file mode 100644 index 23266858f64053..00000000000000 --- a/docs/notebooks/wav2lip-with-output.rst +++ /dev/null @@ -1,265 +0,0 @@ -Wav2Lip: Accurately Lip-syncing Videos and OpenVINO -=================================================== - -Lip sync technologies are widely used for digital human use cases, which -enhance the user experience in dialog scenarios. - -`Wav2Lip `__ is an approach to -generate accurate 2D lip-synced videos in the wild with only one video -and an audio clip. Wav2Lip leverages an accurate lip-sync “expert” model -and consecutive face frames for accurate, natural lip motion generation. - -.. figure:: https://github.com/user-attachments/assets/11d2fb00-4b5a-45f3-b13b-49636b0d48b1 - :alt: teaser - - teaser - -In this notebook, we introduce how to enable and optimize -Wav2Lippipeline with OpenVINO. This is adaptation of the blog article -`Enable 2D Lip Sync Wav2Lip Pipeline with OpenVINO -Runtime `__. - -Here is Wav2Lip pipeline overview: - -.. figure:: https://cdn.prod.website-files.com/62c72c77b482b372ac273024/669487bc70c2767fbb9b6c8e_wav2lip_pipeline.png - :alt: wav2lip_pipeline - - wav2lip_pipeline - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Convert the model to OpenVINO - IR <#convert-the-model-to-openvino-ir>`__ -- `Compiling models and prepare - pipeline <#compiling-models-and-prepare-pipeline>`__ -- `Interactive inference <#interactive-inference>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. 
code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("pip_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py", - ) - open("pip_helper.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - from pip_helper import pip_install - - pip_install("-q", "openvino>=2024.4.0") - pip_install( - "-q", - "huggingface_hub", - "torch>=2.1", - "gradio>=4.19", - "librosa==0.9.2", - "opencv-contrib-python", - "opencv-python", - "tqdm", - "numba", - "numpy<2", - "--extra-index-url", - "https://download.pytorch.org/whl/cpu", - ) - - helpers = ["gradio_helper.py", "ov_inference.py", "ov_wav2lip_helper.py"] - for helper_file in helpers: - if not Path(helper_file).exists(): - r = requests.get(url=f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/wav2lip/{helper_file}") - open(helper_file, "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("wav2lip.ipynb") - -.. code:: ipython3 - - from cmd_helper import clone_repo - - - clone_repo("https://github.com/Rudrabha/Wav2Lip.git") - -Download example files. - -.. code:: ipython3 - - from notebook_utils import download_file - - - download_file("https://github.com/sammysun0711/openvino_aigc_samples/blob/main/Wav2Lip/data_audio_sun_5s.wav?raw=true") - download_file("https://github.com/sammysun0711/openvino_aigc_samples/blob/main/Wav2Lip/data_video_sun_5s.mp4?raw=true") - -Convert the model to OpenVINO IR -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -You don’t need to download checkpoints and load models, just call the -helper function ``download_and_convert_models``. It takes care about it -and will convert both model in OpenVINO format. - -.. code:: ipython3 - - from ov_wav2lip_helper import download_and_convert_models - - - OV_FACE_DETECTION_MODEL_PATH = Path("models/face_detection.xml") - OV_WAV2LIP_MODEL_PATH = Path("models/wav2lip.xml") - - download_and_convert_models(OV_FACE_DETECTION_MODEL_PATH, OV_WAV2LIP_MODEL_PATH) - -Compiling models and prepare pipeline -------------------------------------- - - - -Select device from dropdown list for running inference using OpenVINO. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -``ov_inference.py`` is an adaptation of original pipeline that has only -cli-interface. ``ov_inference`` allows running the inference using -python API and converted OpenVINO models. - -.. 
code:: ipython3 - - import os - - from ov_inference import ov_inference - - - if not os.path.exists("results"): - os.mkdir("results") - - ov_inference( - "data_video_sun_5s.mp4", - "data_audio_sun_5s.wav", - face_detection_path=OV_FACE_DETECTION_MODEL_PATH, - wav2lip_path=OV_WAV2LIP_MODEL_PATH, - inference_device=device.value, - outfile="results/result_voice.mp4", - ) - -Here is an example to compare the original video and the generated video -after the Wav2Lip pipeline: - -.. code:: ipython3 - - from IPython.display import Video, Audio - - Video("data_video_sun_5s.mp4", embed=True) - - - - -.. raw:: html - - - - - -.. code:: ipython3 - - Audio("data_audio_sun_5s.wav") - - - - -.. raw:: html - - - - - - - -The generated video: - -.. code:: ipython3 - - Video("results/result_voice.mp4", embed=True) - - - - -.. raw:: html - - - - - -Interactive inference ---------------------- - - - -.. code:: ipython3 - - from gradio_helper import make_demo - - - demo = make_demo(fn=ov_inference) - - try: - demo.queue().launch(debug=True) - except Exception: - demo.queue().launch(debug=True, share=True) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/" diff --git a/docs/notebooks/whisper-asr-genai-with-output.rst b/docs/notebooks/whisper-asr-genai-with-output.rst deleted file mode 100644 index 0e12c6f772729f..00000000000000 --- a/docs/notebooks/whisper-asr-genai-with-output.rst +++ /dev/null @@ -1,997 +0,0 @@ -Automatic speech recognition using Whisper and OpenVINO with Generate API -========================================================================= - -`Whisper `__ is an automatic speech -recognition (ASR) system trained on 680,000 hours of multilingual and -multitask supervised data collected from the web. - -Whisper is a Transformer based encoder-decoder model, also referred to -as a sequence-to-sequence model. It maps a sequence of audio spectrogram -features to a sequence of text tokens. First, the raw audio inputs are -converted to a log-Mel spectrogram by action of the feature extractor. -Then, the Transformer encoder encodes the spectrogram to form a sequence -of encoder hidden states. Finally, the decoder autoregressively predicts -text tokens, conditional on both the previous tokens and the encoder -hidden states. - -You can see the model architecture in the diagram below: - -.. figure:: https://user-images.githubusercontent.com/29454499/204536571-8f6d8d77-5fbd-4c6d-8e29-14e734837860.svg - :alt: whisper_architecture.svg - - whisper_architecture.svg - -In this tutorial, we consider how to run Whisper using OpenVINO. We will -use the pre-trained model from the `Hugging Face -Transformers `__ -library. The `Hugging Face Optimum -Intel `__ library -converts the models to OpenVINO™ IR format. To simplify the user -experience, we will use `OpenVINO Generate -API `__ for `Whisper -automatic speech recognition -scenarios `__. - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. 
- - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Load PyTorch model <#load-pytorch-model>`__ - - - `Run PyTorch model inference <#run-pytorch-model-inference>`__ - -- `Download and convert model to OpenVINO IR via Optimum Intel - CLI <#download-and-convert-model-to-openvino-ir-via-optimum-intel-cli>`__ -- `Run inference OpenVINO model with - WhisperPipeline <#run-inference-openvino-model-with-whisperpipeline>`__ -- `Compare performance PyTorch vs - OpenVINO <#compare-performance-pytorch-vs-openvino>`__ -- `Quantization <#quantization>`__ - - - `Prepare calibration datasets <#prepare-calibration-datasets>`__ - - `Quantize Whisper encoder and decoder - models <#quantize-whisper-encoder-and-decoder-models>`__ - - `Run quantized model inference <#run-quantized-model-inference>`__ - - `Compare performance and accuracy of the original and quantized - models <#compare-performance-and-accuracy-of-the-original-and-quantized-models>`__ - -- `Interactive demo <#interactive-demo>`__ - -Prerequisites -------------- - - - -.. code:: ipython3 - - import platform - - - %pip install -q "torch>=2.3" "torchvision>=0.18.1" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q -U "transformers>=4.45" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install --pre -q -U "openvino>=2024.5.0" "openvino-tokenizers>=2024.5.0" "openvino-genai>=2024.5.0" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - %pip install -q datasets "gradio>=4.0" "soundfile>=0.12" "librosa" "python-ffmpeg<=1.0.16" - %pip install -q "nncf>=2.14.0" "jiwer" "typing_extensions>=4.9" - if platform.system() == "Darwin": - %pip install -q "numpy<2.0" - - from transformers.utils.import_utils import is_tf_available - - if is_tf_available(): - %pip install -q "numpy<2.0" - -.. code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("whisper-asr-genai.ipynb") - -Load PyTorch model ------------------- - - - -The ``AutoModelForSpeechSeq2Seq.from_pretrained`` method is used for the -initialization of PyTorch Whisper model using the transformers library. -The model will be downloaded once during first run and this process may -require some time. - -You may also choose other models from `Whisper -collection `__, -more on them `here `__. - -Preprocessing and post-processing are important in this model use. -``AutoProcessor`` class used for initialization ``WhisperProcessor`` is -responsible for preparing audio input data for the model, converting it -to Mel-spectrogram and decoding predicted output token_ids into string -using tokenizer. We will use ``pipeline`` method to transcribe audios of -arbitrary length. - -.. 
code:: ipython3 - - import ipywidgets as widgets - - model_ids = { - "Multilingual models": [ - "openai/whisper-large-v3-turbo", - "openai/whisper-large-v3", - "openai/whisper-large-v2", - "openai/whisper-large", - "openai/whisper-medium", - "openai/whisper-small", - "openai/whisper-base", - "openai/whisper-tiny", - ], - "English-only models": [ - "distil-whisper/distil-large-v2", - "distil-whisper/distil-large-v3", - "distil-whisper/distil-medium.en", - "distil-whisper/distil-small.en", - "openai/whisper-medium.en", - "openai/whisper-small.en", - "openai/whisper-base.en", - "openai/whisper-tiny.en", - ], - } - - model_type = widgets.Dropdown( - options=model_ids.keys(), - value="Multilingual models", - description="Model:", - disabled=False, - ) - - model_type - - - - -.. parsed-literal:: - - Dropdown(description='Model:', options=('Multilingual models', 'English-only models'), value='Multilingual mod… - - - -.. code:: ipython3 - - model_id = widgets.Dropdown( - options=model_ids[model_type.value], - value=model_ids[model_type.value][-1], - description="Model:", - disabled=False, - ) - - model_id - - - - -.. parsed-literal:: - - Dropdown(description='Model:', index=7, options=('openai/whisper-large-v3-turbo', 'openai/whisper-large-v3', '… - - - -.. code:: ipython3 - - from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline - from transformers.utils import logging - import torch - - processor = AutoProcessor.from_pretrained(model_id.value) - - pt_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id.value) - - pipe_pt = pipeline( - "automatic-speech-recognition", - model=pt_model, - tokenizer=processor.tokenizer, - feature_extractor=processor.feature_extractor, - device=torch.device("cpu"), - ) - -Run PyTorch model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The ``pipeline`` expects audio data in numpy array format. We will use -``.wav`` file and convert it numpy array format for that purpose. - -.. code:: ipython3 - - from notebook_utils import download_file - - en_example_short = Path("data", "courtroom.wav") - - if not en_example_short.exists(): - # a wav sample - download_file( - "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/courtroom.wav", - en_example_short.name, - directory=en_example_short.parent, - ) - - -.. parsed-literal:: - - 'data/courtroom.wav' already exists. - - - - -.. parsed-literal:: - - PosixPath('/home/labuser/work/notebook/openvino_notebooks/notebooks/whisper-asr-genai/data/courtroom.wav') - - - -.. code:: ipython3 - - import librosa - - en_raw_speech, samplerate = librosa.load(str(en_example_short), sr=16000) - -Let’s check how to work the ``transcribe`` task. - -.. code:: ipython3 - - import copy - import IPython.display as ipd - - logging.set_verbosity_error() - - sample = copy.deepcopy(en_raw_speech) - - display(ipd.Audio(sample, rate=samplerate)) - - pt_result = pipe_pt(sample) - print(f"Result: {pt_result['text']}") - - - -.. raw:: html - - - - - - -.. parsed-literal:: - - /home/labuser/work/notebook/whisper_new/lib/python3.10/site-packages/transformers/models/whisper/generation_whisper.py:496: FutureWarning: The input name `inputs` is deprecated. Please make sure to use `input_features` instead. - warnings.warn( - - -.. parsed-literal:: - - Result: Colonel Jessif, did you order the code rate? You don't have to answer that question. I'll answer the question. You want answers? I think I'm entitled. You want answers? I want the truth. You can't handle the truth. 
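To make this preprocessing step more concrete, the short sketch below (an
illustration, not part of the original notebook) runs only the feature
extractor on the sample audio loaded above and inspects the log-Mel
spectrogram that the encoder will receive:

.. code:: ipython3

    # Pad/trim the waveform to 30 seconds and convert it to a log-Mel spectrogram.
    features = processor.feature_extractor(en_raw_speech, sampling_rate=samplerate, return_tensors="np")

    # For the default whisper-tiny model this prints (1, 80, 3000):
    # 80 Mel bins x 3000 spectrogram frames.
    print(features["input_features"].shape)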
- - -If the multilingual model was chosen, let’s see how task ``translate`` -is working. We will use ``facebook/multilingual_librispeech`` -multilingual dataset, so you can choose the language. The model will -translate audio from the selected language into English. Conversion of -audio to numpy format is handled by Hugging Face datasets -implementation. A complete list of languages ​​supported by the model can -be found in the `paper `__. - -.. code:: ipython3 - - import ipywidgets as widgets - - languages = {"japanese": "ja_jp", "dutch": "da_dk", "french": "fr_fr", "spanish": "ca_es", "italian": "it_it", "portuguese": "pt_br", "polish": "pl_pl"} - - SAMPLE_LANG = None - if model_type.value == "Multilingual models": - SAMPLE_LANG = widgets.Dropdown( - options=languages.keys(), - value="italian", - description="Dataset language:", - disabled=False, - ) - - SAMPLE_LANG - - - - -.. parsed-literal:: - - Dropdown(description='Dataset language:', index=4, options=('japanese', 'dutch', 'french', 'spanish', 'italian… - - - -.. code:: ipython3 - - from datasets import load_dataset - - mls_dataset = None - if model_type.value == "Multilingual models": - mls_dataset = load_dataset("google/fleurs", languages[SAMPLE_LANG.value], split="test", streaming=True, trust_remote_code=True) - mls_dataset = iter(mls_dataset) # make it iterable - mls_example = next(mls_dataset) # get one example - -.. code:: ipython3 - - if model_type.value == "Multilingual models": - sample = copy.deepcopy(mls_example["audio"]) - - display(ipd.Audio(sample["array"], rate=sample["sampling_rate"])) - print(f"Reference: {mls_example['raw_transcription']}") - - pt_result = pipe_pt(sample, generate_kwargs={"task": "translate"}) - print(f"\nResult: {pt_result['text']}") - - - -.. raw:: html - - - - - - -.. parsed-literal:: - - Reference: Il blog è uno strumento che si prefigge di incoraggiare la collaborazione e sviluppare l'apprendimento degli studenti ben oltre la giornata scolastica normale. - - Result: The blog is our tool that is prefilled to encourage collaboration and develop the learning of the students and to attract a normal school class. - - -Convert model to OpenVINO IR via Optimum Intel CLI --------------------------------------------------- - - - -Listed Whisper model are available for downloading via the `HuggingFace -hub `__. We will use optimum-cli -interface for exporting it into OpenVINO Intermediate Representation -(IR) format. - -Optimum CLI interface for converting models supports export to OpenVINO -(supported starting optimum-intel 1.12 version). General command format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where ``--model`` argument is model id from HuggingFace Hub or local -directory with model (saved using ``.save_pretrained`` method), -``--task`` is one of `supported -task `__ -that exported model should solve. For LLMs it will be -``automatic-speech-recognition-with-past``. If model initialization -requires to use remote code, ``--trust-remote-code`` flag additionally -should be passed. Full list of supported arguments available via -``--help`` For more details and examples of usage, please check `optimum -documentation `__. - -.. 
code:: ipython3 - - import logging - import nncf - from cmd_helper import optimum_cli - - nncf.set_log_level(logging.ERROR) - - model_path = Path(model_id.value.split("/")[1]) - optimum_cli(model_id.value, model_path) - print(f"✅ {model_id.value} model converted and can be found in {model_path}") - -Run inference OpenVINO model with WhisperPipeline -------------------------------------------------- - - - -To simplify user experience we will use `OpenVINO Generate -API `__. -Firstly we will create pipeline with ``WhisperPipeline``. You can -construct it straight away from the folder with the converted model. It -will automatically load the ``model``, ``tokenizer``, ``detokenizer`` -and default ``generation configuration``. - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(default="CPU", exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -.. code:: ipython3 - - import openvino_genai as ov_genai - - ov_pipe = ov_genai.WhisperPipeline(str(model_path), device=device.value) - -Let’s run the ``transcribe`` task. We just call ``generate`` for that -and put array as input. - -.. code:: ipython3 - - genai_result = ov_pipe.generate(en_raw_speech) - - display(ipd.Audio(en_raw_speech, rate=samplerate)) - print(f"Result: {genai_result}") - - - -.. raw:: html - - - - - - -.. parsed-literal:: - - Result: Colonel Jessif, did you order the code rate? You don't have to answer that question. I'll answer the question. You want answers? I think I'm entitled. You want answers? I want the truth. You can't handle the truth. - - -Whisper could provide a phrase-level timestamps for audio. Let’s try -this scenario, we will specify ``return_timestamps=True`` for -``generate`` method. - -``generate`` method with ``return_timestamps`` set to ``True`` will -return ``chunks``, which contain attributes: ``text``, ``start_ts`` and -``end_ts`` in seconds. - -.. code:: ipython3 - - genai_result_timestamps = ov_pipe.generate(en_raw_speech, return_timestamps=True) - - for segment in genai_result_timestamps.chunks: - print(f"{segment.start_ts}sec. ---> {segment.end_ts}sec.") - print(f"{segment.text}\n") - - -.. parsed-literal:: - - 0.0sec. ---> 3.0sec. - Colonel Jessif, did you order the code rate? - - 3.0sec. ---> 4.5sec. - You don't have to answer that question. - - 4.5sec. ---> 6.5sec. - I'll answer the question. - - 6.5sec. ---> 8.0sec. - You want answers? - - 8.0sec. ---> 9.0sec. - I think I'm entitled. - - 9.0sec. ---> 10.0sec. - You want answers? - - 10.0sec. ---> 11.0sec. - I want the truth. - - 11.0sec. ---> 13.0sec. - You can't handle the truth. - - - -Let’s see how to work the ``translate`` task. It supports for -multilingual models only. For that case we will specify ``language`` and -``task`` options. We can do this in different ways. We can get default -config with ``get_generation_config()``, setup parameters and put config -directly to ``generate()``. It’s also possible to specify the needed -options just as inputs in the ``generate()`` method and we will use this -way. Then we just run ``generate`` method and get the output in text -format. - -.. 
code:: ipython3 - - languages_genai = { - "japanese": "<|ja|>", - "dutch": "<|da|>", - "french": "<|fr|>", - "spanish": "<|es|>", - "italian": "<|it|>", - "portuguese": "<|pt|>", - "polish": "<|pl|>", - } - - if model_type.value == "Multilingual models": - sample = mls_example["audio"] - - genai_result_ml = ov_pipe.generate(sample["array"], max_new_tokens=100, task="translate", language=languages_genai[SAMPLE_LANG.value]) - - display(ipd.Audio(sample["array"], rate=sample["sampling_rate"])) - print(f"Reference: {mls_example['raw_transcription']}") - print(f"\nResult: {genai_result_ml}") - - - -.. raw:: html - - - - - - -.. parsed-literal:: - - Reference: Il blog è uno strumento che si prefigge di incoraggiare la collaborazione e sviluppare l'apprendimento degli studenti ben oltre la giornata scolastica normale. - - Result: The blog is our tool that is prefilled to encourage collaboration and develop the learning of the students and to attract a normal school class. - - -Compare performance PyTorch vs OpenVINO ---------------------------------------- - - - -.. code:: ipython3 - - import time - import numpy as np - from tqdm.notebook import tqdm - - - def measure_perf(pipe, n=10, model_type="ov"): - timers = [] - for _ in tqdm(range(n), desc="Measuring performance"): - sample = copy.deepcopy(en_raw_speech) - start = time.perf_counter() - if model_type == "pt": - pipe(sample) - elif model_type == "ov": - pipe.generate(sample) - end = time.perf_counter() - timers.append(end - start) - return np.median(timers) - -.. code:: ipython3 - - perf_torch = measure_perf(pipe_pt, model_type="pt") - perf_ov = measure_perf(ov_pipe) - - - -.. parsed-literal:: - - Measuring performance: 0%| | 0/10 [00:00`__ enables -post-training quantization by adding the quantization layers into the -model graph and then using a subset of the training dataset to -initialize the parameters of these additional quantization layers. The -framework is designed so that modifications to your original training -code are minor. - -The optimization process contains the following steps: - -1. Create a calibration dataset for quantization. -2. Run ``nncf.quantize`` to obtain quantized encoder and decoder models. -3. Serialize the ``INT8`` model using ``openvino.save_model`` function. - -.. - - **Note**: Quantization is time and memory consuming operation. - Running quantization code below may take some time. - -Please select below whether you would like to run Whisper quantization. - -.. code:: ipython3 - - from notebook_utils import quantization_widget - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - import requests - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - ov_quantized_pipe = None - - %load_ext skip_kernel_extension - -Let’s load converted OpenVINO model format using Optimum-Intel to easily -quantize it. - -Optimum Intel can be used to load optimized models from the `Hugging -Face Hub `__ or -local folder to create pipelines to run an inference with OpenVINO -Runtime using Hugging Face APIs. The Optimum Inference models are API -compatible with Hugging Face Transformers models. 
This means we just -need to replace the ``AutoModelForXxx`` class with the corresponding -``OVModelForXxx`` class. - -Below is an example of the whisper-tiny model - -.. code:: diff - - -from transformers import AutoModelForSpeechSeq2Seq - +from optimum.intel.openvino import OVModelForSpeechSeq2Seq - from transformers import AutoTokenizer, pipeline - - model_id = "openai/whisper-tiny" - -model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id) - +model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True) - -Like the original PyTorch model, the OpenVINO model is also compatible -with HuggingFace -`pipeline `__ -interface for ``automatic-speech-recognition``. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from optimum.intel.openvino import OVModelForSpeechSeq2Seq - - ov_model = OVModelForSpeechSeq2Seq.from_pretrained(str(model_path), device=device.value) - ov_processor = AutoProcessor.from_pretrained(str(model_path)) - -Prepare calibration datasets -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -First step is to prepare calibration datasets for quantization. Since we -quantize whisper encoder and decoder separately, we need to prepare a -calibration dataset for each of the models. We import an -``InferRequestWrapper`` class that will intercept model inputs and -collect them to a list. Then we run model inference on some small amount -of audio samples. Generally, increasing the calibration dataset size -improves quantization quality. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from itertools import islice - from optimum.intel.openvino.quantization import InferRequestWrapper - - - def collect_calibration_dataset(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int): - # Overwrite model request properties, saving the original ones for restoring later - encoder_calibration_data = [] - decoder_calibration_data = [] - ov_model.encoder.request = InferRequestWrapper(ov_model.encoder.request, encoder_calibration_data, apply_caching=True) - ov_model.decoder.request = InferRequestWrapper(ov_model.decoder.request, - decoder_calibration_data, - apply_caching=True) - - pipe = pipeline( - "automatic-speech-recognition", - model=ov_model, - chunk_length_s=30, - tokenizer=ov_processor.tokenizer, - feature_extractor=ov_processor.feature_extractor, - device=torch.device("cpu") - ) - try: - calibration_dataset = dataset = load_dataset("openslr/librispeech_asr", "clean", split="validation", streaming=True, trust_remote_code=True) - for sample in tqdm(islice(calibration_dataset, calibration_dataset_size), desc="Collecting calibration data", - total=calibration_dataset_size): - pipe(sample["audio"], return_timestamps=True) - finally: - ov_model.encoder.request = ov_model.encoder.request.request - ov_model.decoder.request = ov_model.decoder.request.request - - return encoder_calibration_data, decoder_calibration_data - -Quantize Whisper encoder and decoder models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Below we run the ``quantize`` function which calls ``nncf.quantize`` on -Whisper encoder and decoder-with-past models. We don’t quantize -first-step-decoder because its share in whole inference time is -negligible. - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - import gc - import shutil - import nncf - import openvino as ov - from datasets import load_dataset - from tqdm.notebook import tqdm - - - - CALIBRATION_DATASET_SIZE = 30 - quantized_model_path = Path(f"{model_path}-quantized") - - - def quantize(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int): - if not quantized_model_path.exists(): - encoder_calibration_data, decoder_calibration_data = collect_calibration_dataset( - ov_model, calibration_dataset_size - ) - print("Quantizing encoder") - quantized_encoder = nncf.quantize( - ov_model.encoder.model, - nncf.Dataset(encoder_calibration_data), - subset_size=len(encoder_calibration_data), - model_type=nncf.ModelType.TRANSFORMER, - # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search - advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.80) - ) - ov.save_model(quantized_encoder, quantized_model_path / "openvino_encoder_model.xml") - del quantized_encoder - del encoder_calibration_data - gc.collect() - - print("Quantizing decoder") - quantized_decoder = nncf.quantize( - ov_model.decoder.model, - nncf.Dataset(decoder_calibration_data), - subset_size=len(decoder_calibration_data), - model_type=nncf.ModelType.TRANSFORMER, - # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search - advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.96) - ) - ov.save_model(quantized_decoder, quantized_model_path / "openvino_decoder_model.xml") - del quantized_decoder - del decoder_calibration_data - gc.collect() - - # Copy the config file and the first-step-decoder manually - shutil.copy(model_path / "config.json", quantized_model_path / "config.json") - shutil.copy(model_path / "generation_config.json", quantized_model_path / "generation_config.json") - shutil.copy(model_path / "openvino_decoder_model.xml", quantized_model_path / "openvino_decoder_model.xml") - shutil.copy(model_path / "openvino_decoder_model.bin", quantized_model_path / "openvino_decoder_model.bin") - shutil.copy(model_path / "openvino_tokenizer.xml", quantized_model_path / "openvino_tokenizer.xml") - shutil.copy(model_path / "openvino_tokenizer.bin", quantized_model_path / "openvino_tokenizer.bin") - shutil.copy(model_path / "openvino_detokenizer.xml", quantized_model_path / "openvino_detokenizer.xml") - shutil.copy(model_path / "openvino_detokenizer.bin", quantized_model_path / "openvino_detokenizer.bin") - shutil.copy(model_path / "tokenizer_config.json", quantized_model_path / "tokenizer_config.json") - shutil.copy(model_path / "tokenizer.json", quantized_model_path / "tokenizer.json") - shutil.copy(model_path / "vocab.json", quantized_model_path / "vocab.json") - shutil.copy(model_path / "preprocessor_config.json", quantized_model_path / "preprocessor_config.json") - shutil.copy(model_path / "special_tokens_map.json", quantized_model_path / "special_tokens_map.json") - shutil.copy(model_path / "normalizer.json", quantized_model_path / "normalizer.json") - shutil.copy(model_path / "merges.txt", quantized_model_path / "merges.txt") - shutil.copy(model_path / "added_tokens.json", quantized_model_path / "added_tokens.json") - - quantized_ov_pipe = ov_genai.WhisperPipeline(str(quantized_model_path), device=device.value) - return quantized_ov_pipe - - - ov_quantized_pipe = quantize(ov_model, CALIBRATION_DATASET_SIZE) - -Run quantized model inference 
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Let’s compare the transcription results for original and quantized -models. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - genai_result = ov_pipe.generate(en_raw_speech) - quantized_genai_result = ov_quantized_pipe.generate(en_raw_speech) - - display(ipd.Audio(en_raw_speech, rate=samplerate)) - - print(f"Original : {genai_result}") - print(f"Quantized: {quantized_genai_result}") - - - -.. raw:: html - - - - - - -.. parsed-literal:: - - Original : Colonel Jessif, did you order the code rate? You don't have to answer that question. I'll answer the question. You want answers? I think I'm entitled. You want answers? I want the truth. You can't handle the truth. - Quantized: Don, I'll just, if you order the code right. You don have to answer that question. I'll answer the question. You want answers. I think I'm entitled you want answer. I want the truth. You can't handle the truth. You can't handle the truth. - - -Compare performance and accuracy of the original and quantized models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Finally, we compare original and quantized Whisper models from accuracy -and performance stand-points. - -To measure accuracy, we use ``1 - WER`` as a metric, where WER stands -for Word Error Rate. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import time - from contextlib import contextmanager - from jiwer import wer, wer_standardize - - - TEST_DATASET_SIZE = 50 - - def calculate_transcription_time_and_accuracy(ov_model, test_samples): - whole_infer_times = [] - - ground_truths = [] - predictions = [] - for data_item in tqdm(test_samples, desc="Measuring performance and accuracy"): - - start_time = time.perf_counter() - transcription = ov_model.generate(data_item["audio"]["array"]) - end_time = time.perf_counter() - whole_infer_times.append(end_time - start_time) - - ground_truths.append(data_item["text"]) - predictions.append(transcription.texts[0]) - - word_accuracy = (1 - wer(ground_truths, predictions, reference_transform=wer_standardize, - hypothesis_transform=wer_standardize)) * 100 - mean_whole_infer_time = sum(whole_infer_times) - return word_accuracy, mean_whole_infer_time - - test_dataset = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True) - test_dataset = test_dataset.shuffle(seed=42).take(TEST_DATASET_SIZE) - test_samples = [sample for sample in test_dataset] - - accuracy_original, times_original = calculate_transcription_time_and_accuracy(ov_pipe, test_samples) - accuracy_quantized, times_quantized = calculate_transcription_time_and_accuracy(ov_quantized_pipe, test_samples) - print(f"Whole pipeline performance speedup: {times_original / times_quantized:.3f}") - print(f"Whisper transcription word accuracy. Original model: {accuracy_original:.2f}%. Quantized model: {accuracy_quantized:.2f}%.") - print(f"Accuracy drop: {accuracy_original - accuracy_quantized:.2f}%.") - - - -.. parsed-literal:: - - Measuring performance and accuracy: 0%| | 0/50 [00:00`__ is an automatic speech -recognition (ASR) system trained on 680,000 hours of multilingual and -multitask supervised data collected from the web. It is a multi-task -model that can perform multilingual speech recognition as well as speech -translation and language identification. - -.. 
figure:: https://user-images.githubusercontent.com/29454499/204536347-28976978-9a07-416c-acff-fc1214bbfbe0.svg - :alt: asr-training-data-desktop.svg - - asr-training-data-desktop.svg - -You can find more information about this model in the `research -paper `__, `OpenAI -blog `__, `model -card `__ and -GitHub `repository `__. - -In this notebook, we will use Whisper model with `OpenVINO Generate -API `__ for `Whisper -automatic speech recognition -scenarios `__ -to generate subtitles in a sample video. Additionally, we will use -`NNCF `__ improving model -performance by INT8 quantization. Notebook contains the following steps: -1. Download the model. 2. Instantiate the PyTorch model pipeline. 3. -Convert model to OpenVINO IR, using model conversion API. 4. Run the -Whisper pipeline with OpenVINO models. 5. Quantize the OpenVINO model -with NNCF. 6. Check quantized model result for the demo video. 7. -Compare model size, performance and accuracy of FP32 and quantized INT8 -models. 8. Launch Interactive demo for video subtitles generation. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Instantiate model <#instantiate-model>`__ - - - `Convert model to OpenVINO Intermediate Representation (IR) - format. <#convert-model-to-openvino-intermediate-representation-ir-format->`__ - -- `Prepare inference pipeline <#prepare-inference-pipeline>`__ - - - `Select inference device <#select-inference-device>`__ - -- `Run video transcription - pipeline <#run-video-transcription-pipeline>`__ -- `Quantization <#quantization>`__ - - - `Prepare calibration datasets <#prepare-calibration-datasets>`__ - - `Quantize Whisper encoder and decoder - models <#quantize-whisper-encoder-and-decoder-models>`__ - - `Run quantized model inference <#run-quantized-model-inference>`__ - - `Compare performance and accuracy of the original and quantized - models <#compare-performance-and-accuracy-of-the-original-and-quantized-models>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -Install dependencies. - -.. code:: ipython3 - - import platform - import importlib.metadata - import importlib.util - - %pip install -q "nncf>=2.14.0" - %pip install --pre -q -U "openvino>=2024.5.0" "openvino-tokenizers>=2024.5.0" "openvino-genai>=2024.5.0" - %pip install -q -U "python-ffmpeg<=1.0.16" "ffmpeg" "moviepy" "transformers>=4.45" "git+https://github.com/huggingface/optimum-intel.git" "torch>=2.1" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q -U "yt_dlp>=2024.8.6" soundfile librosa jiwer packaging - %pip install -q -U "gradio>=4.19" "typing_extensions>=4.9" - - if platform.system() == "Darwin": - %pip install -q "numpy<2.0" - - - from packaging import version - - try: - if ( - importlib.util.find_spec("tensorflow") is not None - and version.parse(importlib.metadata.version("tensorflow")) < version.parse("2.18.0") - and version.parse(importlib.metadata.version("numpy")) >= version.parse("2.0.0") - ): - %pip uninstall -q -y tensorflow - except importlib.metadata.PackageNotFoundError: - %pip uninstall -q -y tensorflow - -.. 
code:: ipython3 - - import requests - from pathlib import Path - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("whisper-subtitles-generation.ipynb") - -Instantiate model ------------------ - - - -Whisper is a Transformer based encoder-decoder model, also referred to -as a sequence-to-sequence model. It maps a sequence of audio spectrogram -features to a sequence of text tokens. First, the raw audio inputs are -converted to a log-Mel spectrogram by action of the feature extractor. -Then, the Transformer encoder encodes the spectrogram to form a sequence -of encoder hidden states. Finally, the decoder autoregressively predicts -text tokens, conditional on both the previous tokens and the encoder -hidden states. - -You can see the model architecture in the diagram below: - -.. figure:: https://user-images.githubusercontent.com/29454499/204536571-8f6d8d77-5fbd-4c6d-8e29-14e734837860.svg - :alt: whisper_architecture.svg - - whisper_architecture.svg - -There are several models of different sizes and capabilities trained by -the authors of the model. In this tutorial, we will use the ``tiny`` -model, but the same actions are also applicable to other models from -Whisper family. - -.. code:: ipython3 - - import ipywidgets as widgets - - MODELS = [ - "openai/whisper-large-v3-turbo", - "openai/whisper-large-v3", - "openai/whisper-large-v2", - "openai/whisper-large", - "openai/whisper-medium", - "openai/whisper-small", - "openai/whisper-base", - "openai/whisper-tiny", - ] - - model_id = widgets.Dropdown( - options=list(MODELS), - value="openai/whisper-tiny", - description="Model:", - disabled=False, - ) - - model_id - - - - -.. parsed-literal:: - - Dropdown(description='Model:', index=7, options=('openai/whisper-large-v3-turbo', 'openai/whisper-large-v3', '… - - - -Convert model to OpenVINO Intermediate Representation (IR) format using Optimum-Intel. -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Listed Whisper model are available for downloading via the `HuggingFace -hub `__. We will use optimum-cli -interface for exporting it into OpenVINO Intermediate Representation -(IR) format. - -Optimum CLI interface for converting models supports export to OpenVINO -(supported starting optimum-intel 1.12 version). General command format: - -.. code:: bash - - optimum-cli export openvino --model --task - -where ``--model`` argument is model id from HuggingFace Hub or local -directory with model (saved using ``.save_pretrained`` method), -``--task`` is one of `supported -task `__ -that exported model should solve. For LLMs it will be -``automatic-speech-recognition-with-past``. If model initialization -requires to use remote code, ``--trust-remote-code`` flag additionally -should be passed. Full list of supported arguments available via -``--help`` For more details and examples of usage, please check `optimum -documentation `__. - -.. 
code:: ipython3 - - from cmd_helper import optimum_cli - - model_dir = model_id.value.split("/")[-1] - - if not Path(model_dir).exists(): - optimum_cli(model_id.value, model_dir) - -Prepare inference pipeline --------------------------- - - - -The image below illustrates the pipeline of video transcribing using the -Whisper model. - -.. figure:: https://user-images.githubusercontent.com/29454499/204536733-1f4342f7-2328-476a-a431-cb596df69854.png - :alt: whisper_pipeline.png - - whisper_pipeline.png - -To simplify user experience we will use `OpenVINO Generate -API `__. -Firstly we will create pipeline with ``WhisperPipeline``. You can -construct it straight away from the folder with the converted model. It -will automatically load the ``model``, ``tokenizer``, ``detokenizer`` -and default ``generation configuration``. - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget(default="CPU", exclude=["NPU"]) - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') - - - -.. code:: ipython3 - - import openvino_genai as ov_genai - - ov_pipe = ov_genai.WhisperPipeline(str(model_dir), device=device.value) - -Run video transcription pipeline --------------------------------- - - - -Now, we are ready to start transcription. Let’s load the video first. - -.. code:: ipython3 - - from notebook_utils import download_file - - output_file = Path("downloaded_video.mp4") - - if not output_file.exists(): - download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Sheldon%20Cooper%20Jim%20Parsons%20at%20Intels%20Lab.mp4", - filename=output_file.name, - ) - - -.. parsed-literal:: - - 'downloaded_video.mp4' already exists. - - - - -.. parsed-literal:: - - PosixPath('/home/labuser/work/notebook/openvino_notebooks/notebooks/whisper-subtitles-generation/downloaded_video.mp4') - - - -Select the task for the model: - -- **transcribe** - generate audio transcription in the source language - (automatically detected). -- **translate** - generate audio transcription with translation to - English language. - -.. code:: ipython3 - - task = widgets.Select( - options=["transcribe", "translate"], - value="translate", - description="Select task:", - disabled=False, - ) - task - - - - -.. parsed-literal:: - - Select(description='Select task:', index=1, options=('transcribe', 'translate'), value='translate') - - - -.. 
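-
-The pipeline created above already carries a default generation configuration.
-Instead of passing options to every ``generate()`` call, you can also read this
-configuration, adjust it, and set it back globally; the value below is purely
-illustrative (448 tokens is the usual Whisper decoding limit):
-
-.. code:: ipython3
-
-    config = ov_pipe.get_generation_config()
-    print("Default max_length:", config.max_length)
-
-    config.max_length = 448  # illustrative value
-    ov_pipe.set_generation_config(config)
-
-..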
code:: ipython3 - - try: - from moviepy import VideoFileClip - except ImportError: - from moviepy.editor import VideoFileClip - from transformers.pipelines.audio_utils import ffmpeg_read - - - def get_audio(video_file): - """ - Extract audio signal from a given video file, then convert it to float, - then mono-channel format and resample it to the expected sample rate - - Parameters: - video_file: path to input video file - Returns: - resampled_audio: mono-channel float audio signal with 16000 Hz sample rate - extracted from video - duration: duration of video fragment in seconds - """ - input_video = VideoFileClip(str(video_file)) - duration = input_video.duration - audio_file = video_file.stem + ".wav" - input_video.audio.write_audiofile(audio_file, logger=None) - with open(audio_file, "rb") as f: - inputs = f.read() - audio = ffmpeg_read(inputs, 16000) - return { - "raw": audio, - "sampling_rate": 16000, - }, duration - -Let’s run generation method. We will put input data as ``np array``. -Also we will specify ``task`` and ``return_timestamps=True`` options. If -task is ``translate``, you can place ``language`` option, for example -``<|fr|>`` for French or it would be detect automatically. We can set up -generation parameters in different ways. We can get default config with -``get_generation_config()``, setup parameters and put config directly to -``generate()``. It’s also possible to specify the needed options just as -inputs in the ``generate()`` method and we will use this way. Then we -just run ``generate`` method and get the output in text format. - -``generate`` method with ``return_timestamps`` set to ``True`` will -return ``chunks``, which contain attributes: ``text``, ``start_ts`` and -``end_ts`` - -.. code:: ipython3 - - inputs, duration = get_audio(output_file) - - transcription = ov_pipe.generate(inputs["raw"], task=task.value, return_timestamps=True).chunks - -.. code:: ipython3 - - import math - - - def format_timestamp(seconds: float): - """ - format time in srt-file expected format - """ - assert seconds >= 0, "non-negative timestamp expected" - milliseconds = round(seconds * 1000.0) - - hours = milliseconds // 3_600_000 - milliseconds -= hours * 3_600_000 - - minutes = milliseconds // 60_000 - milliseconds -= minutes * 60_000 - - seconds = milliseconds // 1_000 - milliseconds -= seconds * 1_000 - - return (f"{hours}:" if hours > 0 else "00:") + f"{minutes:02d}:{seconds:02d},{milliseconds:03d}" - - - def prepare_srt(transcription, filter_duration=None): - """ - Format transcription into srt file format - """ - segment_lines = [] - for idx, segment in enumerate(transcription): - timestamp = (segment.start_ts, segment.end_ts) - # for the case where the model could not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. - if segment.end_ts == -1: - timestamp[1] = filter_duration - - if filter_duration is not None and (timestamp[0] >= math.floor(filter_duration) or timestamp[1] > math.ceil(filter_duration) + 1): - break - segment_lines.append(str(idx + 1) + "\n") - time_start = format_timestamp(timestamp[0]) - time_end = format_timestamp(timestamp[1]) - time_str = f"{time_start} --> {time_end}\n" - segment_lines.append(time_str) - segment_lines.append(segment.text + "\n\n") - return segment_lines - -"The results will be saved in the ``downloaded_video.srt`` file. SRT is -one of the most popular formats for storing subtitles and is compatible -with many modern video players. 
This file can be used to embed -transcription into videos during playback or by injecting them directly -into video files using ``ffmpeg``. - -.. code:: ipython3 - - srt_lines = prepare_srt(transcription, filter_duration=duration) - # save transcription - with output_file.with_suffix(".srt").open("w") as f: - f.writelines(srt_lines) - -Now let us see the results. - -.. code:: ipython3 - - widgets.Video.from_file(output_file, loop=False, width=800, height=800) - - - - -.. parsed-literal:: - - Video(value=b'\x00\x00\x00\x18ftypmp42\x00\x00\x00\x00isommp42\x00\x00Aimoov\x00\x00\x00lmvhd...', height='800… - - - -.. code:: ipython3 - - print("".join(srt_lines)) - - -.. parsed-literal:: - - 1 - 00:00:00,000 --> 00:00:05,000 - Oh, what's that? - - 2 - 00:00:05,000 --> 00:00:08,000 - Oh, wow. - - 3 - 00:00:08,000 --> 00:00:10,000 - Hello, humans. - - 4 - 00:00:13,000 --> 00:00:15,000 - Focus on me. - - 5 - 00:00:15,000 --> 00:00:17,000 - Focus on the guard. - - 6 - 00:00:17,000 --> 00:00:20,000 - Don't tell anyone what you're seeing in here. - - 7 - 00:00:22,000 --> 00:00:24,000 - Have you seen what's in there? - - 8 - 00:00:24,000 --> 00:00:25,000 - They have intel. - - 9 - 00:00:25,000 --> 00:00:27,000 - This is where it all changes. - - - - -Quantization ------------- - - - -`NNCF `__ enables -post-training quantization by adding the quantization layers into the -model graph and then using a subset of the training dataset to -initialize the parameters of these additional quantization layers. The -framework is designed so that modifications to your original training -code are minor. - -The optimization process contains the following steps: - -1. Create a calibration dataset for quantization. -2. Run ``nncf.quantize`` to obtain quantized encoder and decoder models. -3. Serialize the ``INT8`` model using ``openvino.save_model`` function. - -.. - - **Note**: Quantization is time and memory consuming operation. - Running quantization code below may take some time. - -Please select below whether you would like to run Whisper quantization. - -.. code:: ipython3 - - to_quantize = widgets.Checkbox( - value=True, - description="Quantization", - disabled=False, - ) - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - # Fetch `skip_kernel_extension` module - import requests - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - ov_quantized_model = None - quantized_ov_pipe = None - - %load_ext skip_kernel_extension - -Let’s load converted OpenVINO model format using Optimum-Intel to easily -quantize it. - -Optimum Intel can be used to load optimized models from the `Hugging -Face Hub `__ or -local folder to create pipelines to run an inference with OpenVINO -Runtime using Hugging Face APIs. The Optimum Inference models are API -compatible with Hugging Face Transformers models. This means we just -need to replace the ``AutoModelForXxx`` class with the corresponding -``OVModelForXxx`` class. - -Below is an example of the whisper-tiny model - -.. 
code:: diff - - -from transformers import AutoModelForSpeechSeq2Seq - +from optimum.intel.openvino import OVModelForSpeechSeq2Seq - from transformers import AutoTokenizer, pipeline - - model_id = "openai/whisper-tiny" - -model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id) - +model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True) - -Like the original PyTorch model, the OpenVINO model is also compatible -with HuggingFace -`pipeline `__ -interface for ``automatic-speech-recognition``. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from transformers import AutoProcessor - from optimum.intel.openvino import OVModelForSpeechSeq2Seq - - ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_dir, device=device.value) - processor = AutoProcessor.from_pretrained(model_dir) - -Prepare calibration datasets -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -First step is to prepare calibration datasets for quantization. Since we -quantize whisper encoder and decoder separately, we need to prepare a -calibration dataset for each of the models. We import an -``InferRequestWrapper`` class that will intercept model inputs and -collect them to a list. Then we run model inference on some small amount -of audio samples. Generally, increasing the calibration dataset size -improves quantization quality. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from itertools import islice - from tqdm.notebook import tqdm - from datasets import load_dataset - from transformers import pipeline - from optimum.intel.openvino.quantization import InferRequestWrapper - - - def collect_calibration_dataset(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int): - # Overwrite model request properties, saving the original ones for restoring later - encoder_calibration_data = [] - decoder_calibration_data = [] - ov_model.encoder.request = InferRequestWrapper(ov_model.encoder.request, encoder_calibration_data, apply_caching=True) - ov_model.decoder.request = InferRequestWrapper(ov_model.decoder.request, - decoder_calibration_data, - apply_caching=True) - - pipe = pipeline( - "automatic-speech-recognition", - model=ov_model, - chunk_length_s=30, - tokenizer=processor.tokenizer, - feature_extractor=processor.feature_extractor, devide=torch.device("cpu")) - try: - calibration_dataset = dataset = load_dataset("openslr/librispeech_asr", "clean", split="validation", streaming=True, trust_remote_code=True) - for sample in tqdm(islice(calibration_dataset, calibration_dataset_size), desc="Collecting calibration data", - total=calibration_dataset_size): - pipe(sample["audio"], generate_kwargs={"task": task.value}, return_timestamps=True) - finally: - ov_model.encoder.request = ov_model.encoder.request.request - ov_model.decoder.request = ov_model.decoder.request.request - - return encoder_calibration_data, decoder_calibration_data - -Quantize Whisper encoder and decoder models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Below we run the ``quantize`` function which calls ``nncf.quantize`` on -Whisper encoder and decoder-with-past models. We don’t quantize -first-step-decoder because its share in whole inference time is -negligible. - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - import gc - import shutil - import nncf - import openvino as ov - - - CALIBRATION_DATASET_SIZE = 30 - quantized_model_path = Path(f"{model_dir}_quantized") - - - def quantize(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int): - if not quantized_model_path.exists(): - encoder_calibration_data, decoder_calibration_data = collect_calibration_dataset(ov_model, calibration_dataset_size) - print("Quantizing encoder") - quantized_encoder = nncf.quantize( - ov_model.encoder.model, - nncf.Dataset(encoder_calibration_data), - subset_size=len(encoder_calibration_data), - model_type=nncf.ModelType.TRANSFORMER, - # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search - advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.80), - ) - ov.save_model(quantized_encoder, quantized_model_path / "openvino_encoder_model.xml") - del quantized_encoder - del encoder_calibration_data - gc.collect() - - print("Quantizing decoder") - quantized_decoder = nncf.quantize( - ov_model.decoder.model, - nncf.Dataset(decoder_calibration_data), - subset_size=len(decoder_calibration_data), - model_type=nncf.ModelType.TRANSFORMER, - # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search - advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.96), - ) - ov.save_model(quantized_decoder, quantized_model_path / "openvino_decoder_model.xml") - del quantized_decoder - del decoder_calibration_data - gc.collect() - - # Copy the config file and the first-step-decoder manually - model_path = Path(model_dir) - shutil.copy(model_path / "config.json", quantized_model_path / "config.json") - shutil.copy(model_path / "generation_config.json", quantized_model_path / "generation_config.json") - shutil.copy(model_path / "openvino_decoder_model.xml", quantized_model_path / "openvino_decoder_model.xml") - shutil.copy(model_path / "openvino_decoder_model.bin", quantized_model_path / "openvino_decoder_model.bin") - shutil.copy(model_path / "openvino_tokenizer.xml", quantized_model_path / "openvino_tokenizer.xml") - shutil.copy(model_path / "openvino_tokenizer.bin", quantized_model_path / "openvino_tokenizer.bin") - shutil.copy(model_path / "openvino_detokenizer.xml", quantized_model_path / "openvino_detokenizer.xml") - shutil.copy(model_path / "openvino_detokenizer.bin", quantized_model_path / "openvino_detokenizer.bin") - shutil.copy(model_path / "tokenizer_config.json", quantized_model_path / "tokenizer_config.json") - shutil.copy(model_path / "tokenizer.json", quantized_model_path / "tokenizer.json") - shutil.copy(model_path / "vocab.json", quantized_model_path / "vocab.json") - shutil.copy(model_path / "preprocessor_config.json", quantized_model_path / "preprocessor_config.json") - shutil.copy(model_path / "special_tokens_map.json", quantized_model_path / "special_tokens_map.json") - shutil.copy(model_path / "normalizer.json", quantized_model_path / "normalizer.json") - shutil.copy(model_path / "merges.txt", quantized_model_path / "merges.txt") - shutil.copy(model_path / "added_tokens.json", quantized_model_path / "added_tokens.json") - - quantized_ov_pipe = ov_genai.WhisperPipeline(str(quantized_model_path), device=device.value) - return quantized_ov_pipe - - - quantized_ov_pipe = quantize(ov_model, CALIBRATION_DATASET_SIZE) - -Run quantized model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Let’s compare 
the transcription results for original and quantized -models. - -.. code:: ipython3 - - if ov_quantized_model is not None: - inputs, duration = get_audio(output_file) - transcription = quantized_ov_pipe.generate(inputs["raw"], task=task.value, return_timestamps=True).chunks - srt_lines = prepare_srt(transcription, filter_duration=duration) - print("".join(srt_lines)) - widgets.Video.from_file(output_file, loop=False, width=800, height=800) - -Compare performance and accuracy of the original and quantized models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Finally, we compare original and quantized Whisper models from accuracy -and performance stand-points. - -To measure accuracy, we use ``1 - WER`` as a metric, where WER stands -for Word Error Rate. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import time - from contextlib import contextmanager - from jiwer import wer, wer_standardize - - TEST_DATASET_SIZE = 50 - - def calculate_transcription_time_and_accuracy(ov_model, test_samples): - whole_infer_times = [] - - ground_truths = [] - predictions = [] - for data_item in tqdm(test_samples, desc="Measuring performance and accuracy"): - start_time = time.perf_counter() - transcription = ov_model.generate(data_item["audio"]["array"], return_timestamps=True) - end_time = time.perf_counter() - whole_infer_times.append(end_time - start_time) - - ground_truths.append(data_item["text"]) - predictions.append(transcription.texts[0]) - - word_accuracy = (1 - wer(ground_truths, predictions, reference_transform=wer_standardize, - hypothesis_transform=wer_standardize)) * 100 - mean_whole_infer_time = sum(whole_infer_times) - return word_accuracy, mean_whole_infer_time - - test_dataset = load_dataset("openslr/librispeech_asr", "clean", split="validation", streaming=True, trust_remote_code=True) - test_dataset = test_dataset.shuffle(seed=42).take(TEST_DATASET_SIZE) - test_samples = [sample for sample in test_dataset] - - accuracy_original, times_original = calculate_transcription_time_and_accuracy(ov_pipe, test_samples) - accuracy_quantized, times_quantized = calculate_transcription_time_and_accuracy(quantized_ov_pipe, test_samples) - print(f"Whole pipeline performance speedup: {times_original / times_quantized:.3f}") - print(f"Whisper transcription word accuracy. Original model: {accuracy_original:.2f}%. Quantized model: {accuracy_quantized:.2f}%.") - print(f"Accuracy drop: {accuracy_original - accuracy_quantized:.2f}%.") - - - -.. 
parsed-literal:: - - Measuring performance and accuracy: 0%| | 0/50 [00:00 30: - config = ov_pipe.get_generation_config() - chink_num = math.ceil(frame_num / 30) - config.max_length = chink_num * def_config.max_length - m_pipe.set_generation_config(config) - - transcription = m_pipe.generate(inputs["raw"], task=task.lower(), return_timestamps=True).chunks - srt_lines = prepare_srt(transcription, duration) - with data_path.with_suffix(".srt").open("w") as f: - f.writelines(srt_lines) - return [str(data_path), str(data_path.with_suffix(".srt"))] - - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/whisper-subtitles-generation/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=transcribe, quantized=ov_quantized_model is not None, sample_path=output_file) - - try: - demo.launch(debug=False) - except Exception: - demo.launch(share=True, debug=False) - # if you are launching remotely, specify server_name and server_port - # demo.launch(server_name='your server name', server_port='server port in int') - # Read more in the docs: https://gradio.app/docs/ diff --git a/docs/notebooks/yolov10-optimization-with-output.rst b/docs/notebooks/yolov10-optimization-with-output.rst deleted file mode 100644 index 8c2e6dc4495e65..00000000000000 --- a/docs/notebooks/yolov10-optimization-with-output.rst +++ /dev/null @@ -1,1115 +0,0 @@ -Convert and Optimize YOLOv10 with OpenVINO -========================================== - -Real-time object detection aims to accurately predict object categories -and positions in images with low latency. The YOLO series has been at -the forefront of this research due to its balance between performance -and efficiency. However, reliance on NMS and architectural -inefficiencies have hindered optimal performance. YOLOv10 addresses -these issues by introducing consistent dual assignments for NMS-free -training and a holistic efficiency-accuracy driven model design -strategy. - -YOLOv10, built on the `Ultralytics Python -package `__ by researchers at -`Tsinghua University `__, introduces a -new approach to real-time object detection, addressing both the -post-processing and model architecture deficiencies found in previous -YOLO versions. By eliminating non-maximum suppression (NMS) and -optimizing various model components, YOLOv10 achieves state-of-the-art -performance with significantly reduced computational overhead. Extensive -experiments demonstrate its superior accuracy-latency trade-offs across -multiple model scales. - -.. figure:: https://github.com/ultralytics/ultralytics/assets/26833433/f9b1bec0-928e-41ce-a205-e12db3c4929a - :alt: yolov10-approach.png - - yolov10-approach.png - -More details about model architecture you can find in original -`repo `__, -`paper `__ and `Ultralytics -documentation `__. - -This tutorial demonstrates step-by-step instructions on how to run and -optimize PyTorch YOLO V10 with OpenVINO. - -The tutorial consists of the following steps: - -- Prepare PyTorch model -- Convert PyTorch model to OpenVINO IR -- Run model inference with OpenVINO -- Prepare and run optimization pipeline using NNCF -- Compare performance of the FP16 and quantized models. 
-- Run optimized model inference on video -- Launch interactive Gradio demo - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Download PyTorch model <#download-pytorch-model>`__ -- `Export PyTorch model to OpenVINO IR - Format <#export-pytorch-model-to-openvino-ir-format>`__ -- `Run OpenVINO Inference on AUTO device using Ultralytics - API <#run-openvino-inference-on-auto-device-using-ultralytics-api>`__ -- `Run OpenVINO Inference on selected device using Ultralytics - API <#run-openvino-inference-on-selected-device-using-ultralytics-api>`__ -- `Optimize model using NNCF Post-training Quantization - API <#optimize-model-using-nncf-post-training-quantization-api>`__ - - - `Prepare Quantization Dataset <#prepare-quantization-dataset>`__ - - `Quantize and Save INT8 model <#quantize-and-save-int8-model>`__ - -- `Run Optimized Model Inference <#run-optimized-model-inference>`__ - - - `Run Optimized Model on AUTO - device <#run-optimized-model-on-auto-device>`__ - - `Run Optimized Model Inference on selected - device <#run-optimized-model-inference-on-selected-device>`__ - -- `Compare the Original and Quantized - Models <#compare-the-original-and-quantized-models>`__ - - - `Model size <#model-size>`__ - - `Performance <#performance>`__ - - `FP16 model performance <#fp16-model-performance>`__ - - `Int8 model performance <#int8-model-performance>`__ - -- `Live demo <#live-demo>`__ - - - `Gradio Interactive Demo <#gradio-interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - import os - - os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - - %pip install -q "nncf>=2.11.0" - %pip install -Uq "openvino>=2024.3.0" - %pip install -q "git+https://github.com/THU-MIG/yolov10.git" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q "torch>=2.1,<2.6" "torchvision>=0.16" tqdm opencv-python "gradio>=4.19" "matplotlib>=3.9" --extra-index-url https://download.pytorch.org/whl/cpu - -.. code:: ipython3 - - from pathlib import Path - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file, VideoPlayer, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("yolov10-optimization.ipynb") - -Download PyTorch model ----------------------- - - - -There are several version of `YOLO -V10 `__ -models provided by model authors. Each of them has different -characteristics depends on number of training parameters, performance -and accuracy. For demonstration purposes we will use ``yolov10n``, but -the same steps are also applicable to other models in YOLO V10 series. - -.. code:: ipython3 - - models_dir = Path("./models") - models_dir.mkdir(exist_ok=True) - -.. 
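-
-The released checkpoints share a common naming scheme, so trying another scale
-only means changing the file name used below. The list and URL pattern are
-illustrative; check the release page for the weights that are actually
-published:
-
-.. code:: ipython3
-
-    # Known YOLOv10 scales, smallest to largest (illustrative):
-    yolov10_variants = ["yolov10n", "yolov10s", "yolov10m", "yolov10b", "yolov10l", "yolov10x"]
-    weights_url_template = "https://github.com/jameslahm/yolov10/releases/download/v1.0/{name}.pt"
-    print(weights_url_template.format(name=yolov10_variants[0]))
-
-..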
code:: ipython3 - - model_weights_url = "https://github.com/jameslahm/yolov10/releases/download/v1.0/yolov10n.pt" - file_name = model_weights_url.split("/")[-1] - model_name = file_name.replace(".pt", "") - - if not (models_dir / file_name).exists(): - download_file(model_weights_url, directory=models_dir) - - -.. parsed-literal:: - - 'models/yolov10n.pt' already exists. - - - - -.. parsed-literal:: - - PosixPath('/home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov10-optimization/models/yolov10n.pt') - - - -Export PyTorch model to OpenVINO IR Format ------------------------------------------- - - - -As it was discussed before, YOLO V10 code is designed on top of -`Ultralytics `__ library and has similar -interface with YOLO V8 (You can check `YOLO V8 -notebooks `__ -for more detailed instruction how to work with Ultralytics API). -Ultralytics support OpenVINO model export using -`export `__ method of model -class. Additionally, we can specify parameters responsible for target -input size, static or dynamic input shapes and model precision -(FP32/FP16/INT8). INT8 quantization can be additionally performed on -export stage, but for making approach more flexible, we consider how to -perform quantization using -`NNCF `__. - -.. code:: ipython3 - - import types - from ultralytics.utils import ops, yaml_load, yaml_save - from ultralytics import YOLOv10 - import torch - - detection_labels = { - 0: "person", - 1: "bicycle", - 2: "car", - 3: "motorcycle", - 4: "airplane", - 5: "bus", - 6: "train", - 7: "truck", - 8: "boat", - 9: "traffic light", - 10: "fire hydrant", - 11: "stop sign", - 12: "parking meter", - 13: "bench", - 14: "bird", - 15: "cat", - 16: "dog", - 17: "horse", - 18: "sheep", - 19: "cow", - 20: "elephant", - 21: "bear", - 22: "zebra", - 23: "giraffe", - 24: "backpack", - 25: "umbrella", - 26: "handbag", - 27: "tie", - 28: "suitcase", - 29: "frisbee", - 30: "skis", - 31: "snowboard", - 32: "sports ball", - 33: "kite", - 34: "baseball bat", - 35: "baseball glove", - 36: "skateboard", - 37: "surfboard", - 38: "tennis racket", - 39: "bottle", - 40: "wine glass", - 41: "cup", - 42: "fork", - 43: "knife", - 44: "spoon", - 45: "bowl", - 46: "banana", - 47: "apple", - 48: "sandwich", - 49: "orange", - 50: "broccoli", - 51: "carrot", - 52: "hot dog", - 53: "pizza", - 54: "donut", - 55: "cake", - 56: "chair", - 57: "couch", - 58: "potted plant", - 59: "bed", - 60: "dining table", - 61: "toilet", - 62: "tv", - 63: "laptop", - 64: "mouse", - 65: "remote", - 66: "keyboard", - 67: "cell phone", - 68: "microwave", - 69: "oven", - 70: "toaster", - 71: "sink", - 72: "refrigerator", - 73: "book", - 74: "clock", - 75: "vase", - 76: "scissors", - 77: "teddy bear", - 78: "hair drier", - 79: "toothbrush", - } - - - def v10_det_head_forward(self, x): - one2one = self.forward_feat([xi.detach() for xi in x], self.one2one_cv2, self.one2one_cv3) - if not self.export: - one2many = super().forward(x) - - if not self.training: - one2one = self.inference(one2one) - if not self.export: - return {"one2many": one2many, "one2one": one2one} - else: - assert self.max_det != -1 - boxes, scores, labels = ops.v10postprocess(one2one.permute(0, 2, 1), self.max_det, self.nc) - return torch.cat( - [boxes, scores.unsqueeze(-1), labels.unsqueeze(-1).to(boxes.dtype)], - dim=-1, - ) - else: - return {"one2many": one2many, "one2one": one2one} - - - ov_model_path = models_dir / f"{model_name}_openvino_model/{model_name}.xml" - if not ov_model_path.exists(): - model = YOLOv10(models_dir / file_name) - 
model.model.model[-1].forward = types.MethodType(v10_det_head_forward, model.model.model[-1]) - model.export(format="openvino", dynamic=True, half=True) - config = yaml_load(ov_model_path.parent / "metadata.yaml") - config["names"] = detection_labels - yaml_save(ov_model_path.parent / "metadata.yaml", config) - -Run OpenVINO Inference on AUTO device using Ultralytics API ------------------------------------------------------------ - - - -Now, when we exported model to OpenVINO, we can load it directly into -YOLOv10 class, where automatic inference backend will provide -easy-to-use user experience to run OpenVINO YOLOv10 model on the similar -level like for original PyTorch model. The code bellow demonstrates how -to run inference OpenVINO exported model with Ultralytics API on single -image. `AUTO -device `__ -will be used for launching model. - -.. code:: ipython3 - - ov_yolo_model = YOLOv10(ov_model_path.parent, task="detect") - -.. code:: ipython3 - - from PIL import Image - - IMAGE_PATH = Path("./data/coco_bike.jpg") - if not IMAGE_PATH.exists(): - download_file( - url="https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bike.jpg", - filename=IMAGE_PATH.name, - directory=IMAGE_PATH.parent, - ) - - -.. parsed-literal:: - - 'data/coco_bike.jpg' already exists. - - - - -.. parsed-literal:: - - PosixPath('/home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov10-optimization/data/coco_bike.jpg') - - - -.. code:: ipython3 - - res = ov_yolo_model(IMAGE_PATH, iou=0.45, conf=0.2) - Image.fromarray(res[0].plot()[:, :, ::-1]) - - -.. parsed-literal:: - - Loading models/yolov10n_openvino_model for OpenVINO inference... - requirements: Ultralytics requirement ['openvino>=2024.0.0'] not found, attempting AutoUpdate... - requirements: ❌ AutoUpdate skipped (offline) - Using OpenVINO LATENCY mode for batch=1 inference... - - image 1/1 /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov10-optimization/data/coco_bike.jpg: 640x640 1 bicycle, 2 cars, 1 motorcycle, 1 dog, 72.0ms - Speed: 25.6ms preprocess, 72.0ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 640) - - - - -.. image:: yolov10-optimization-with-output_files/yolov10-optimization-with-output_13_1.png - - - -Run OpenVINO Inference on selected device using Ultralytics API ---------------------------------------------------------------- - - - -In this part of notebook you can select inference device for running -model inference to compare results with AUTO. - -.. code:: ipython3 - - device = device_widget("CPU") - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='CPU') - - - -.. code:: ipython3 - - import openvino as ov - - core = ov.Core() - - ov_model = core.read_model(ov_model_path) - - # load model on selected device - if "GPU" in device.value or "NPU" in device.value: - ov_model.reshape({0: [1, 3, 640, 640]}) - ov_config = {} - if "GPU" in device.value: - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - det_compiled_model = core.compile_model(ov_model, device.value, ov_config) - -.. code:: ipython3 - - ov_yolo_model.predictor.model.ov_compiled_model = det_compiled_model - -.. code:: ipython3 - - res = ov_yolo_model(IMAGE_PATH, iou=0.45, conf=0.2) - - -.. 
parsed-literal:: - - - image 1/1 /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov10-optimization/data/coco_bike.jpg: 640x640 1 bicycle, 2 cars, 1 motorcycle, 1 dog, 29.1ms - Speed: 3.2ms preprocess, 29.1ms inference, 0.3ms postprocess per image at shape (1, 3, 640, 640) - - -.. code:: ipython3 - - Image.fromarray(res[0].plot()[:, :, ::-1]) - - - - -.. image:: yolov10-optimization-with-output_files/yolov10-optimization-with-output_19_0.png - - - -Optimize model using NNCF Post-training Quantization API --------------------------------------------------------- - - - -`NNCF `__ provides a suite of -advanced algorithms for Neural Networks inference optimization in -OpenVINO with minimal accuracy drop. We will use 8-bit quantization in -post-training mode (without the fine-tuning pipeline) to optimize -YOLOv10. - -The optimization process contains the following steps: - -1. Create a Dataset for quantization. -2. Run ``nncf.quantize`` for getting an optimized model. -3. Serialize OpenVINO IR model, using the ``openvino.save_model`` - function. - -Quantization is time and memory consuming process, you can skip this -step using checkbox bellow: - -.. code:: ipython3 - - import ipywidgets as widgets - - int8_model_det_path = models_dir / "int8" / f"{model_name}_openvino_model/{model_name}.xml" - ov_yolo_int8_model = None - - to_quantize = widgets.Checkbox( - value=True, - description="Quantization", - disabled=False, - ) - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -.. code:: ipython3 - - # Fetch skip_kernel_extension module - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -Prepare Quantization Dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -For starting quantization, we need to prepare dataset. We will use -validation subset from `MS COCO dataset `__ -for model quantization and Ultralytics validation data loader for -preparing input data. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from zipfile import ZipFile - - from ultralytics.data.utils import DATASETS_DIR - - if not int8_model_det_path.exists(): - - DATA_URL = "http://images.cocodataset.org/zips/val2017.zip" - LABELS_URL = "https://github.com/ultralytics/yolov5/releases/download/v1.0/coco2017labels-segments.zip" - CFG_URL = "https://raw.githubusercontent.com/ultralytics/ultralytics/v8.1.0/ultralytics/cfg/datasets/coco.yaml" - - OUT_DIR = DATASETS_DIR - - DATA_PATH = OUT_DIR / "val2017.zip" - LABELS_PATH = OUT_DIR / "coco2017labels-segments.zip" - CFG_PATH = OUT_DIR / "coco.yaml" - - if not (OUT_DIR / "coco/labels").exists(): - download_file(DATA_URL, DATA_PATH.name, DATA_PATH.parent) - download_file(LABELS_URL, LABELS_PATH.name, LABELS_PATH.parent) - download_file(CFG_URL, CFG_PATH.name, CFG_PATH.parent) - with ZipFile(LABELS_PATH, "r") as zip_ref: - zip_ref.extractall(OUT_DIR) - with ZipFile(DATA_PATH, "r") as zip_ref: - zip_ref.extractall(OUT_DIR / "coco/images") - -.. 
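-
-Before wiring up the data loader, it can be useful to confirm that the archives
-were unpacked into the layout the Ultralytics validator expects. The check
-below is optional and the layout shown is approximate:
-
-.. code:: ipython3
-
-    from ultralytics.data.utils import DATASETS_DIR
-
-    # Approximate expected layout after extraction:
-    #   <DATASETS_DIR>/coco/images/val2017/*.jpg
-    #   <DATASETS_DIR>/coco/labels/val2017/*.txt
-    #   <DATASETS_DIR>/coco.yaml
-    print("Dataset root:", DATASETS_DIR)
-    print("Entries:", sorted(p.name for p in DATASETS_DIR.glob("coco*")))
-
-..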
code:: ipython3 - - %%skip not $to_quantize.value - - from ultralytics.utils import DEFAULT_CFG - from ultralytics.cfg import get_cfg - from ultralytics.data.converter import coco80_to_coco91_class - from ultralytics.data.utils import check_det_dataset - - if not int8_model_det_path.exists(): - args = get_cfg(cfg=DEFAULT_CFG) - args.data = str(CFG_PATH) - det_validator = ov_yolo_model.task_map[ov_yolo_model.task]["validator"](args=args) - - det_validator.data = check_det_dataset(args.data) - det_validator.stride = 32 - det_data_loader = det_validator.get_dataloader(OUT_DIR / "coco", 1) - -NNCF provides ``nncf.Dataset`` wrapper for using native framework -dataloaders in quantization pipeline. Additionally, we specify transform -function that will be responsible for preparing input data in model -expected format. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - from typing import Dict - - - def transform_fn(data_item:Dict): - """ - Quantization transform function. Extracts and preprocess input data from dataloader item for quantization. - Parameters: - data_item: Dict with data item produced by DataLoader during iteration - Returns: - input_tensor: Input data for quantization - """ - input_tensor = det_validator.preprocess(data_item)['img'].numpy() - return input_tensor - - if not int8_model_det_path.exists(): - quantization_dataset = nncf.Dataset(det_data_loader, transform_fn) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, openvino - - -Quantize and Save INT8 model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The ``nncf.quantize`` function provides an interface for model -quantization. It requires an instance of the OpenVINO Model and -quantization dataset. Optionally, some additional parameters for the -configuration quantization process (number of samples for quantization, -preset, ignored scope, etc.) can be provided. YOLOv10 model contains -non-ReLU activation functions, which require asymmetric quantization of -activations. To achieve a better result, we will use a ``mixed`` -quantization preset. It provides symmetric quantization of weights and -asymmetric quantization of activations. - - **Note**: Model post-training quantization is time-consuming process. - Be patient, it can take several minutes depending on your hardware. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import shutil - - if not int8_model_det_path.exists(): - quantized_det_model = nncf.quantize( - ov_model, - quantization_dataset, - preset=nncf.QuantizationPreset.MIXED, - ) - - ov.save_model(quantized_det_model, int8_model_det_path) - shutil.copy(ov_model_path.parent / "metadata.yaml", int8_model_det_path.parent / "metadata.yaml") - -Run Optimized Model Inference ------------------------------ - - - -The way of usage INT8 quantized model is the same like for model before -quantization. Let’s check inference result of quantized model on single -image - -Run Optimized Model on AUTO device -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - ov_yolo_int8_model = YOLOv10(int8_model_det_path.parent, task="detect") - -.. code:: ipython3 - - %%skip not $to_quantize.value - res = ov_yolo_int8_model(IMAGE_PATH, iou=0.45, conf=0.2) - - -.. parsed-literal:: - - Loading models/int8/yolov10n_openvino_model for OpenVINO inference... - requirements: Ultralytics requirement ['openvino>=2024.0.0'] not found, attempting AutoUpdate... 
- requirements: ❌ AutoUpdate skipped (offline) - Using OpenVINO LATENCY mode for batch=1 inference... - - image 1/1 /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov10-optimization/data/coco_bike.jpg: 640x640 1 bicycle, 3 cars, 2 motorcycles, 1 dog, 92.3ms - Speed: 3.7ms preprocess, 92.3ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 640) - - -.. code:: ipython3 - - Image.fromarray(res[0].plot()[:, :, ::-1]) - - - - -.. image:: yolov10-optimization-with-output_files/yolov10-optimization-with-output_34_0.png - - - -Run Optimized Model Inference on selected device -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - device - -.. code:: ipython3 - - %%skip not $to_quantize.value - - ov_config = {} - if "GPU" in device.value or "NPU" in device.value: - ov_model.reshape({0: [1, 3, 640, 640]}) - ov_config = {} - if "GPU" in device.value: - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - - quantized_det_model = core.read_model(int8_model_det_path) - quantized_det_compiled_model = core.compile_model(quantized_det_model, device.value, ov_config) - - ov_yolo_int8_model.predictor.model.ov_compiled_model = quantized_det_compiled_model - - res = ov_yolo_int8_model(IMAGE_PATH, iou=0.45, conf=0.2) - - -.. parsed-literal:: - - - image 1/1 /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov10-optimization/data/coco_bike.jpg: 640x640 1 bicycle, 3 cars, 2 motorcycles, 1 dog, 26.5ms - Speed: 7.4ms preprocess, 26.5ms inference, 0.3ms postprocess per image at shape (1, 3, 640, 640) - - -.. code:: ipython3 - - Image.fromarray(res[0].plot()[:, :, ::-1]) - - - - -.. image:: yolov10-optimization-with-output_files/yolov10-optimization-with-output_38_0.png - - - -Compare the Original and Quantized Models ------------------------------------------ - - - -Model size -~~~~~~~~~~ - - - -.. code:: ipython3 - - ov_model_weights = ov_model_path.with_suffix(".bin") - print(f"Size of FP16 model is {ov_model_weights.stat().st_size / 1024 / 1024:.2f} MB") - if int8_model_det_path.exists(): - ov_int8_weights = int8_model_det_path.with_suffix(".bin") - print(f"Size of model with INT8 compressed weights is {ov_int8_weights.stat().st_size / 1024 / 1024:.2f} MB") - print(f"Compression rate for INT8 model: {ov_model_weights.stat().st_size / ov_int8_weights.stat().st_size:.3f}") - - -.. parsed-literal:: - - Size of FP16 model is 4.39 MB - Size of model with INT8 compressed weights is 2.25 MB - Compression rate for INT8 model: 1.954 - - -Performance -~~~~~~~~~~~ - - - -FP16 model performance -~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - !benchmark_app -m $ov_model_path -d $device.value -api async -shape "[1,3,640,640]" -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.2.0-15496-17f8e86e5f2-releases/2024/2 - [ INFO ] - [ INFO ] Device info: - [ INFO ] CPU - [ INFO ] Build ................................. 2024.2.0-15496-17f8e86e5f2-releases/2024/2 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(CPU) performance hint will be set to PerformanceMode.THROUGHPUT. 
- [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 31.92 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [?,3,?,?] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_8) : f32 / [...] / [?,300,6] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 17.77 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_8) : f32 / [...] / [1,300,6] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 303.83 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] PERF_COUNT: NO - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 0 - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] AFFINITY: Affinity.CORE - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 30.60 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 2424 iterations - [ INFO ] Duration: 15093.22 ms - [ INFO ] Latency: - [ INFO ] Median: 72.34 ms - [ INFO ] Average: 74.46 ms - [ INFO ] Min: 45.87 ms - [ INFO ] Max: 147.25 ms - [ INFO ] Throughput: 160.60 FPS - - -Int8 model performance -~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - if int8_model_det_path.exists(): - !benchmark_app -m $int8_model_det_path -d $device.value -api async -shape "[1,3,640,640]" -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.2.0-15496-17f8e86e5f2-releases/2024/2 - [ INFO ] - [ INFO ] Device info: - [ INFO ] CPU - [ INFO ] Build ................................. 2024.2.0-15496-17f8e86e5f2-releases/2024/2 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(CPU) performance hint will be set to PerformanceMode.THROUGHPUT. 
- [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 38.75 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [?,3,?,?] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_8) : f32 / [...] / [?,300,6] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 18.33 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_8) : f32 / [...] / [1,300,6] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 622.99 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 18 - [ INFO ] NUM_STREAMS: 18 - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] PERF_COUNT: NO - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 0 - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] AFFINITY: Affinity.CORE - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 18 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 28.26 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 5886 iterations - [ INFO ] Duration: 15067.10 ms - [ INFO ] Latency: - [ INFO ] Median: 44.39 ms - [ INFO ] Average: 45.89 ms - [ INFO ] Min: 29.73 ms - [ INFO ] Max: 110.52 ms - [ INFO ] Throughput: 390.65 FPS - - -Live demo ---------- - - - -The following code runs model inference on a video: - -.. code:: ipython3 - - import collections - import time - from IPython import display - import cv2 - import numpy as np - - - # Main processing function to run object detection. - def run_object_detection( - source=0, - flip=False, - use_popup=False, - skip_first_frames=0, - det_model=ov_yolo_int8_model, - device=device.value, - ): - player = None - try: - # Create a video player to play with target fps. - player = VideoPlayer(source=source, flip=flip, fps=30, skip_first_frames=skip_first_frames) - # Start capturing. - player.start() - if use_popup: - title = "Press ESC to Exit" - cv2.namedWindow(winname=title, flags=cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE) - - processing_times = collections.deque() - while True: - # Grab the frame. - frame = player.next() - if frame is None: - print("Source ended") - break - # If the frame is larger than full HD, reduce size to improve the performance. 
- scale = 1280 / max(frame.shape) - if scale < 1: - frame = cv2.resize( - src=frame, - dsize=None, - fx=scale, - fy=scale, - interpolation=cv2.INTER_AREA, - ) - # Get the results. - input_image = np.array(frame) - - start_time = time.time() - detections = det_model(input_image, iou=0.45, conf=0.2, verbose=False) - stop_time = time.time() - frame = detections[0].plot() - - processing_times.append(stop_time - start_time) - # Use processing times from last 200 frames. - if len(processing_times) > 200: - processing_times.popleft() - - _, f_width = frame.shape[:2] - # Mean processing time [ms]. - processing_time = np.mean(processing_times) * 1000 - fps = 1000 / processing_time - cv2.putText( - img=frame, - text=f"Inference time: {processing_time:.1f}ms ({fps:.1f} FPS)", - org=(20, 40), - fontFace=cv2.FONT_HERSHEY_COMPLEX, - fontScale=f_width / 1000, - color=(0, 0, 255), - thickness=1, - lineType=cv2.LINE_AA, - ) - # Use this workaround if there is flickering. - if use_popup: - cv2.imshow(winname=title, mat=frame) - key = cv2.waitKey(1) - # escape = 27 - if key == 27: - break - else: - # Encode numpy array to jpg. - _, encoded_img = cv2.imencode(ext=".jpg", img=frame, params=[cv2.IMWRITE_JPEG_QUALITY, 100]) - # Create an IPython image. - i = display.Image(data=encoded_img) - # Display the image in this notebook. - display.clear_output(wait=True) - display.display(i) - # ctrl-c - except KeyboardInterrupt: - print("Interrupted") - # any different error - except RuntimeError as e: - print(e) - finally: - if player is not None: - # Stop capturing. - player.stop() - if use_popup: - cv2.destroyAllWindows() - -.. code:: ipython3 - - use_int8 = widgets.Checkbox( - value=ov_yolo_int8_model is not None, - description="Use int8 model", - disabled=ov_yolo_int8_model is None, - ) - - use_int8 - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Use int8 model') - - - -.. code:: ipython3 - - WEBCAM_INFERENCE = False - - if WEBCAM_INFERENCE: - VIDEO_SOURCE = 0 # Webcam - else: - VIDEO_SOURCE = "data/people.mp4" - if not Path(VIDEO_SOURCE).exists(): - download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/people.mp4", - directory="data", - ) - - -.. parsed-literal:: - - 'data/people.mp4' already exists. - - -.. code:: ipython3 - - run_object_detection( - det_model=ov_yolo_model if not use_int8.value else ov_yolo_int8_model, - source=VIDEO_SOURCE, - flip=True, - use_popup=False, - ) - - - -.. image:: yolov10-optimization-with-output_files/yolov10-optimization-with-output_50_0.png - - -.. parsed-literal:: - - Source ended - - -Gradio Interactive Demo -~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - def yolov10_inference(image, int8, conf_threshold, iou_threshold): - model = ov_yolo_model if not int8 else ov_yolo_int8_model - results = model(source=image, iou=iou_threshold, conf=conf_threshold, verbose=False)[0] - annotated_image = Image.fromarray(results.plot()) - - return annotated_image - -.. 
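-
-The same callback can also be exercised directly, without launching the UI,
-which is handy as a quick smoke test. This sketch assumes the test image and
-the models prepared earlier in the notebook:
-
-.. code:: ipython3
-
-    preview = yolov10_inference(
-        Image.open(IMAGE_PATH),
-        int8=use_int8.value and ov_yolo_int8_model is not None,
-        conf_threshold=0.2,
-        iou_threshold=0.45,
-    )
-    preview
-
-..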
code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/yolov10-optimization/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=yolov10_inference, quantized=ov_yolo_int8_model is not None) - - try: - demo.launch(debug=False) - except Exception: - demo.launch(debug=False, share=True) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ diff --git a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_13_1.jpg b/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_13_1.jpg deleted file mode 100644 index 836cc32448048d..00000000000000 --- a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_13_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7bcc16025e6a4d06e026695833651b08c7d22b25496e774c2ae8164261de6f8e -size 112625 diff --git a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_13_1.png b/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_13_1.png deleted file mode 100644 index 251b5e4bd71e03..00000000000000 --- a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_13_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c9740bd591e699546159b62b35630e7ac1ba9c7d960afea45c38645d33255e44 -size 902882 diff --git a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_19_0.jpg b/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_19_0.jpg deleted file mode 100644 index 836cc32448048d..00000000000000 --- a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_19_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7bcc16025e6a4d06e026695833651b08c7d22b25496e774c2ae8164261de6f8e -size 112625 diff --git a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_19_0.png b/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_19_0.png deleted file mode 100644 index 251b5e4bd71e03..00000000000000 --- a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_19_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c9740bd591e699546159b62b35630e7ac1ba9c7d960afea45c38645d33255e44 -size 902882 diff --git a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_34_0.jpg b/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_34_0.jpg deleted file mode 100644 index 9c81531510d84e..00000000000000 --- a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_34_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f3424f59026e5158d83a3a8545734bf5f068adf9c8dd45787d9296aa12767564 -size 112928 diff --git a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_34_0.png 
b/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_34_0.png deleted file mode 100644 index 5bd43473badac0..00000000000000 --- a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_34_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8698b6ae8aecdf4f02069438a78cb63757c21a90a4d01ef9a8f91dc55702c582 -size 901897 diff --git a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_38_0.jpg b/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_38_0.jpg deleted file mode 100644 index 9c81531510d84e..00000000000000 --- a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_38_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f3424f59026e5158d83a3a8545734bf5f068adf9c8dd45787d9296aa12767564 -size 112928 diff --git a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_38_0.png b/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_38_0.png deleted file mode 100644 index 5bd43473badac0..00000000000000 --- a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_38_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8698b6ae8aecdf4f02069438a78cb63757c21a90a4d01ef9a8f91dc55702c582 -size 901897 diff --git a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_50_0.png b/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_50_0.png deleted file mode 100644 index 7c7cf1cdf7e121..00000000000000 --- a/docs/notebooks/yolov10-optimization-with-output_files/yolov10-optimization-with-output_50_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:239f9c39bcad7e6849d2f0f5d1264601f3e6542690b16d16a7bc0be1d12eb18e -size 528576 diff --git a/docs/notebooks/yolov11-instance-segmentation-with-output.rst b/docs/notebooks/yolov11-instance-segmentation-with-output.rst deleted file mode 100644 index 5af1d0c5cb54b0..00000000000000 --- a/docs/notebooks/yolov11-instance-segmentation-with-output.rst +++ /dev/null @@ -1,1009 +0,0 @@ -Convert and Optimize YOLOv11 instance segmentation model with OpenVINO™ -======================================================================= - -Instance segmentation goes a step further than object detection and -involves identifying individual objects in an image and segmenting them -from the rest of the image. Instance segmentation as an object detection -are often used as key components in computer vision systems. -Applications that use real-time instance segmentation models include -video analytics, robotics, autonomous vehicles, multi-object tracking -and object counting, medical image analysis, and many others. - -This tutorial demonstrates step-by-step instructions on how to run and -optimize PyTorch YOLOv11 with OpenVINO. We consider the steps required -for instance segmentation scenario. You can find more details about -model on `model page `__ in -Ultralytics documentation. - -The tutorial consists of the following steps: - Prepare the PyTorch -model. - Download and prepare a dataset. - Validate the original model. -- Convert the PyTorch model to OpenVINO IR. - Validate the converted -model. - Prepare and run optimization pipeline. - Compare performance of -the FP32 and quantized models. 
- Compare accuracy of the FP32 and -quantized models. - Live demo - - -**Table of contents:** - - -- `Get PyTorch model <#get-pytorch-model>`__ - - - `Prerequisites <#prerequisites>`__ - -- `Instantiate model <#instantiate-model>`__ - - - `Convert model to OpenVINO IR <#convert-model-to-openvino-ir>`__ - - `Verify model inference <#verify-model-inference>`__ - - `Select inference device <#select-inference-device>`__ - - `Test on single image <#test-on-single-image>`__ - -- `Optimize model using NNCF Post-training Quantization - API <#optimize-model-using-nncf-post-training-quantization-api>`__ - - - `Validate Quantized model - inference <#validate-quantized-model-inference>`__ - -- `Compare the Original and Quantized - Models <#compare-the-original-and-quantized-models>`__ - - - `Compare performance of the Original and Quantized - Models <#compare-performance-of-the-original-and-quantized-models>`__ - -- `Other ways to optimize model <#other-ways-to-optimize-model>`__ -- `Live demo <#live-demo>`__ - - - `Run Live Object Detection and - Segmentation <#run-live-object-detection-and-segmentation>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Get PyTorch model ------------------ - - - -Generally, PyTorch models represent an instance of the -`torch.nn.Module `__ -class, initialized by a state dictionary with model weights. We will use -the YOLOv11 nano model (also known as ``yolo11n-seg``) pre-trained on a -COCO dataset, which is available in this -`repo `__. Similar steps are -also applicable to other YOLOv11 models. Typical steps to obtain a -pre-trained model: 1. Create an instance of a model class. 2. Load a -checkpoint state dict, which contains the pre-trained model weights. 3. -Turn the model to evaluation for switching some operations to inference -mode. - -In this case, the creators of the model provide an API that enables -converting the YOLOv11 model to OpenVINO IR. Therefore, we do not need -to do these steps manually. - -Prerequisites -^^^^^^^^^^^^^ - - - -Install necessary packages. - -.. code:: ipython3 - - %pip install -q "openvino>=2024.0.0" "nncf>=2.9.0" - %pip install -q "torch>=2.1" "torchvision>=0.16" "ultralytics==8.3.59" opencv-python tqdm --extra-index-url https://download.pytorch.org/whl/cpu - -Import required utility functions. The lower cell will download the -``notebook_utils`` Python module from GitHub. - -.. code:: ipython3 - - from pathlib import Path - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - - from notebook_utils import download_file, VideoPlayer, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("yolov11-instance-segmentation.ipynb") - -.. code:: ipython3 - - # Download a test sample - IMAGE_PATH = Path("./data/coco_bike.jpg") - - if not IMAGE_PATH.exists(): - download_file( - url="https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bike.jpg", - filename=IMAGE_PATH.name, - directory=IMAGE_PATH.parent, - ) - - -.. 
parsed-literal:: - - 'data/coco_bike.jpg' already exists. - - - - -.. parsed-literal:: - - PosixPath('/home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov11-optimization/data/coco_bike.jpg') - - - -Instantiate model ------------------ - - - -For loading the model, required to specify a path to the model -checkpoint. It can be some local path or name available on models hub -(in this case model checkpoint will be downloaded automatically). You -can select model using widget bellow: - -.. code:: ipython3 - - import ipywidgets as widgets - - model_id = [ - "yolo11n-seg", - "yolo11s-seg", - "yolo11m-seg", - "yolo11l-seg", - "yolo11x-seg", - "yolov8n-seg", - "yolov8s-seg", - "yolov8m-seg", - "yolov8l-seg", - "yolov8x-seg", - ] - - model_name = widgets.Dropdown(options=model_id, value=model_id[0], description="Model") - - model_name - - - - -.. parsed-literal:: - - Dropdown(description='Model', options=('yolo11n-seg', 'yolo11s-seg', 'yolo11m-seg', 'yolo11l-seg', 'yolo11x-se… - - - -Making prediction, the model accepts a path to input image and returns -list with Results class object. Results contains boxes for object -detection model and boxes and masks for segmentation model. Also it -contains utilities for processing results, for example, ``plot()`` -method for drawing. - -Let us consider the examples: - -.. code:: ipython3 - - from PIL import Image - from ultralytics import YOLO - - SEG_MODEL_NAME = model_name.value - - seg_model = YOLO(f"{SEG_MODEL_NAME}.pt") - label_map = seg_model.model.names - - res = seg_model(IMAGE_PATH) - Image.fromarray(res[0].plot()[:, :, ::-1]) - - -.. parsed-literal:: - - - image 1/1 /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov11-optimization/data/coco_bike.jpg: 480x640 3 bicycles, 2 cars, 1 motorcycle, 1 dog, 68.3ms - Speed: 2.9ms preprocess, 68.3ms inference, 105.2ms postprocess per image at shape (1, 3, 480, 640) - - - - -.. image:: yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_10_1.png - - - -Convert model to OpenVINO IR -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Ultralytics provides API for convenient model exporting to different -formats including OpenVINO IR. ``model.export`` is responsible for model -conversion. We need to specify the format, and additionally, we can -preserve dynamic shapes in the model. - -.. code:: ipython3 - - # instance segmentation model - seg_model_path = Path(f"{SEG_MODEL_NAME}_openvino_model/{SEG_MODEL_NAME}.xml") - if not seg_model_path.exists(): - seg_model.export(format="openvino", dynamic=True, half=True) - -Verify model inference -~~~~~~~~~~~~~~~~~~~~~~ - - - -We can reuse the base model pipeline for pre- and postprocessing just -replacing the inference method where we will use the IR model for -inference. - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -Select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Test on single image -~~~~~~~~~~~~~~~~~~~~ - - - -.. 
code:: ipython3 - - import openvino as ov - - core = ov.Core() - seg_ov_model = core.read_model(seg_model_path) - - ov_config = {} - if device.value != "CPU": - seg_ov_model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device.value or ("AUTO" in device.value and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - seg_compiled_model = core.compile_model(seg_ov_model, device.value, ov_config) - -.. code:: ipython3 - - seg_model = YOLO(seg_model_path.parent, task="segment") - - if seg_model.predictor is None: - custom = {"conf": 0.25, "batch": 1, "save": False, "mode": "predict"} # method defaults - args = {**seg_model.overrides, **custom} - seg_model.predictor = seg_model._smart_load("predictor")(overrides=args, _callbacks=seg_model.callbacks) - seg_model.predictor.setup_model(model=seg_model.model) - - seg_model.predictor.model.ov_compiled_model = seg_compiled_model - - -.. parsed-literal:: - - Ultralytics 8.3.0 🚀 Python-3.8.10 torch-2.4.1+cu121 CUDA:0 (NVIDIA GeForce RTX 3090, 24250MiB) - Loading yolo11n-seg_openvino_model for OpenVINO inference... - Using OpenVINO LATENCY mode for batch=1 inference... - - -.. code:: ipython3 - - res = seg_model(IMAGE_PATH) - Image.fromarray(res[0].plot()[:, :, ::-1]) - - -.. parsed-literal:: - - - image 1/1 /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov11-optimization/data/coco_bike.jpg: 640x640 3 bicycles, 2 cars, 1 dog, 24.4ms - Speed: 3.6ms preprocess, 24.4ms inference, 18.4ms postprocess per image at shape (1, 3, 640, 640) - - - - -.. image:: yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_19_1.png - - - -Great! The result is the same, as produced by original models. - -Optimize model using NNCF Post-training Quantization API --------------------------------------------------------- - - - -`NNCF `__ provides a suite of -advanced algorithms for Neural Networks inference optimization in -OpenVINO with minimal accuracy drop. We will use 8-bit quantization in -post-training mode (without the fine-tuning pipeline) to optimize -YOLOv11. - -The optimization process contains the following steps: - -1. Create a Dataset for quantization. -2. Run ``nncf.quantize`` for getting an optimized model. -3. Serialize OpenVINO IR model, using the ``openvino.runtime.serialize`` - function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - import ipywidgets as widgets - - int8_model_seg_path = Path(f"{SEG_MODEL_NAME}_openvino_int8_model/{SEG_MODEL_NAME}.xml") - quantized_seg_model = None - - to_quantize = widgets.Checkbox( - value=True, - description="Quantization", - disabled=False, - ) - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. code:: ipython3 - - # Fetch skip_kernel_extension module - import requests - - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -Reuse validation dataloader in accuracy testing for quantization. For -that, it should be wrapped into the ``nncf.Dataset`` object and define a -transformation function for getting only input tensors. - -.. 
code:: ipython3 - - # %%skip not $to_quantize.value - - - import nncf - from typing import Dict - - from zipfile import ZipFile - - from ultralytics.data.utils import DATASETS_DIR - from ultralytics.utils import DEFAULT_CFG - from ultralytics.cfg import get_cfg - from ultralytics.data.converter import coco80_to_coco91_class - from ultralytics.data.utils import check_det_dataset - from ultralytics.utils import ops - - - if not int8_model_seg_path.exists(): - DATA_URL = "http://images.cocodataset.org/zips/val2017.zip" - LABELS_URL = "https://github.com/ultralytics/yolov5/releases/download/v1.0/coco2017labels-segments.zip" - CFG_URL = "https://raw.githubusercontent.com/ultralytics/ultralytics/v8.1.0/ultralytics/cfg/datasets/coco.yaml" - - OUT_DIR = DATASETS_DIR - - DATA_PATH = OUT_DIR / "val2017.zip" - LABELS_PATH = OUT_DIR / "coco2017labels-segments.zip" - CFG_PATH = OUT_DIR / "coco.yaml" - - if not (OUT_DIR / "coco/labels").exists(): - download_file(DATA_URL, DATA_PATH.name, DATA_PATH.parent) - download_file(LABELS_URL, LABELS_PATH.name, LABELS_PATH.parent) - download_file(CFG_URL, CFG_PATH.name, CFG_PATH.parent) - with ZipFile(LABELS_PATH, "r") as zip_ref: - zip_ref.extractall(OUT_DIR) - with ZipFile(DATA_PATH, "r") as zip_ref: - zip_ref.extractall(OUT_DIR / "coco/images") - - args = get_cfg(cfg=DEFAULT_CFG) - args.data = str(CFG_PATH) - seg_validator = seg_model.task_map[seg_model.task]["validator"](args=args) - seg_validator.data = check_det_dataset(args.data) - seg_validator.stride = 32 - seg_data_loader = seg_validator.get_dataloader(OUT_DIR / "coco/", 1) - - seg_validator.is_coco = True - seg_validator.class_map = coco80_to_coco91_class() - seg_validator.names = label_map - seg_validator.metrics.names = seg_validator.names - seg_validator.nc = 80 - seg_validator.nm = 32 - seg_validator.process = ops.process_mask - seg_validator.plot_masks = [] - - def transform_fn(data_item: Dict): - """ - Quantization transform function. Extracts and preprocess input data from dataloader item for quantization. - Parameters: - data_item: Dict with data item produced by DataLoader during iteration - Returns: - input_tensor: Input data for quantization - """ - input_tensor = seg_validator.preprocess(data_item)["img"].numpy() - return input_tensor - - quantization_dataset = nncf.Dataset(seg_data_loader, transform_fn) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -The ``nncf.quantize`` function provides an interface for model -quantization. It requires an instance of the OpenVINO Model and -quantization dataset. Optionally, some additional parameters for the -configuration quantization process (number of samples for quantization, -preset, ignored scope, etc.) can be provided. Ultralytics models contain -non-ReLU activation functions, which require asymmetric quantization of -activations. To achieve a better result, we will use a ``mixed`` -quantization preset. It provides symmetric quantization of weights and -asymmetric quantization of activations. For more accurate results, we -should keep the operation in the postprocessing subgraph in floating -point precision, using the ``ignored_scope`` parameter. - - **Note**: Model post-training quantization is time-consuming process. - Be patient, it can take several minutes depending on your hardware. - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - if not int8_model_seg_path.exists(): - ignored_scope = nncf.IgnoredScope( # post-processing - subgraphs=[ - nncf.Subgraph(inputs=[f"__module.model.{22 if 'v8' in SEG_MODEL_NAME else 23}/aten::cat/Concat", - f"__module.model.{22 if 'v8' in SEG_MODEL_NAME else 23}/aten::cat/Concat_1", - f"__module.model.{22 if 'v8' in SEG_MODEL_NAME else 23}/aten::cat/Concat_2", - f"__module.model.{22 if 'v8' in SEG_MODEL_NAME else 23}/aten::cat/Concat_7"], - outputs=[f"__module.model.{22 if 'v8' in SEG_MODEL_NAME else 23}/aten::cat/Concat_8"]) - ] - ) - - # Segmentation model - quantized_seg_model = nncf.quantize( - seg_ov_model, - quantization_dataset, - preset=nncf.QuantizationPreset.MIXED, - ignored_scope=ignored_scope - ) - - print(f"Quantized segmentation model will be saved to {int8_model_seg_path}") - ov.save_model(quantized_seg_model, str(int8_model_seg_path)) - -Validate Quantized model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -``nncf.quantize`` returns the OpenVINO Model class instance, which is -suitable for loading on a device for making predictions. ``INT8`` model -input data and output result formats have no difference from the -floating point model representation. Therefore, we can reuse the same -``detect`` function defined above for getting the ``INT8`` model result -on the image. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - device - -.. code:: ipython3 - - %%skip not $to_quantize.value - - if quantized_seg_model is None: - quantized_seg_model = core.read_model(int8_model_seg_path) - - ov_config = {} - if device.value != "CPU": - quantized_seg_model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device.value or ("AUTO" in device.value and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - - quantized_seg_compiled_model = core.compile_model(quantized_seg_model, device.value, ov_config) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - - if seg_model.predictor is None: - custom = {"conf": 0.25, "batch": 1, "save": False, "mode": "predict"} # method defaults - args = {**seg_model.overrides, **custom} - seg_model.predictor = seg_model._smart_load("predictor")(overrides=args, _callbacks=seg_model.callbacks) - seg_model.predictor.setup_model(model=seg_model.model) - - seg_model.predictor.model.ov_compiled_model = quantized_seg_compiled_model - -.. code:: ipython3 - - %%skip not $to_quantize.value - - res = seg_model(IMAGE_PATH) - display(Image.fromarray(res[0].plot()[:, :, ::-1])) - - -.. parsed-literal:: - - - image 1/1 /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov11-optimization/data/coco_bike.jpg: 640x640 2 bicycles, 2 cars, 1 dog, 17.9ms - Speed: 2.8ms preprocess, 17.9ms inference, 15.1ms postprocess per image at shape (1, 3, 640, 640) - - - -.. image:: yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_34_1.png - - -Compare the Original and Quantized Models ------------------------------------------ - - - -Compare performance of the Original and Quantized Models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Finally, use the OpenVINO -`Benchmark -Tool `__ -to measure the inference performance of the ``FP32`` and ``INT8`` -models. - - **Note**: For more accurate performance, it is recommended to run - ``benchmark_app`` in a terminal/command prompt after closing other - applications. 
Run - ``benchmark_app -m -d CPU -shape ""`` to - benchmark async inference on CPU on specific input data shape for one - minute. Change ``CPU`` to ``GPU`` to benchmark on GPU. Run - ``benchmark_app --help`` to see an overview of all command-line - options. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - device - -.. code:: ipython3 - - if int8_model_seg_path.exists(): - !benchmark_app -m $seg_model_path -d $device.value -api async -shape "[1,3,640,640]" -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.5.0-16814-e1c167a841c - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.5.0-16814-e1c167a841c - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 22.19 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [?,3,?,?] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_8) : f32 / [...] / [?,116,21..] - [ INFO ] input.255 (node: __module.model.23.cv4.2.1.act/aten::silu_/Swish_46) : f32 / [...] / [?,32,8..,8..] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 13.32 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_8) : f32 / [...] / [1,116,8400] - [ INFO ] input.255 (node: __module.model.23.cv4.2.1.act/aten::silu_/Swish_46) : f32 / [...] / [1,32,160,160] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 543.18 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! 
- [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 40.05 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 2028 iterations - [ INFO ] Duration: 15090.79 ms - [ INFO ] Latency: - [ INFO ] Median: 85.36 ms - [ INFO ] Average: 89.01 ms - [ INFO ] Min: 58.65 ms - [ INFO ] Max: 200.05 ms - [ INFO ] Throughput: 134.39 FPS - - -.. code:: ipython3 - - if int8_model_seg_path.exists(): - !benchmark_app -m $int8_model_seg_path -d $device.value -api async -shape "[1,3,640,640]" -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.5.0-16814-e1c167a841c - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.5.0-16814-e1c167a841c - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 40.03 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_8) : f32 / [...] / [1,116,8400] - [ INFO ] input.255 (node: __module.model.23.cv4.2.1.act/aten::silu_/Swish_46) : f32 / [...] / [1,32,160,160] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 0.05 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_8) : f32 / [...] / [1,116,8400] - [ INFO ] input.255 (node: __module.model.23.cv4.2.1.act/aten::silu_/Swish_46) : f32 / [...] 
/ [1,32,160,160] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 821.20 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 23.59 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 4392 iterations - [ INFO ] Duration: 15061.13 ms - [ INFO ] Latency: - [ INFO ] Median: 38.36 ms - [ INFO ] Average: 40.98 ms - [ INFO ] Min: 28.65 ms - [ INFO ] Max: 110.14 ms - [ INFO ] Throughput: 291.61 FPS - - -Other ways to optimize model ----------------------------- - - - -The performance could be also improved by another OpenVINO method such -as async inference pipeline or preprocessing API. - -Async Inference pipeline help to utilize the device more optimal. The -key advantage of the Async API is that when a device is busy with -inference, the application can perform other tasks in parallel (for -example, populating inputs or scheduling other requests) rather than -wait for the current inference to complete first. To understand how to -perform async inference using openvino, refer to `Async API -tutorial `__ - -Preprocessing API enables making preprocessing a part of the model -reducing application code and dependency on additional image processing -libraries. The main advantage of Preprocessing API is that preprocessing -steps will be integrated into the execution graph and will be performed -on a selected device (CPU/GPU etc.) rather than always being executed on -CPU as part of an application. This will also improve selected device -utilization. For more information, refer to the overview of -`Preprocessing API -tutorial `__. To -see, how it could be used with YOLOV8 object detection model, please, -see `Convert and Optimize YOLOv8 real-time object detection with -OpenVINO tutorial `__ - -Live demo ---------- - - - -The following code runs model inference on a video: - -.. 
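Before that, a small schematic sketch of the Preprocessing API mentioned in the previous
section may help illustrate the idea. It is not required by the demo (the Ultralytics
pipeline performs its own preprocessing) and is only an assumption of how ``u8`` NHWC
input handling and scaling could be folded into the compiled model; ``seg_model_path``
and ``device`` are the objects created in the earlier cells of this notebook.

.. code:: ipython3

    import openvino as ov
    from openvino.preprocess import PrePostProcessor

    core = ov.Core()
    # Reuse the IR exported earlier in this notebook (any IR path works the same way)
    ppp_model = core.read_model(seg_model_path)

    ppp = PrePostProcessor(ppp_model)
    # Describe the tensor the application will actually provide: a u8 image in NHWC layout
    ppp.input().tensor().set_element_type(ov.Type.u8).set_layout(ov.Layout("NHWC"))
    # Ask OpenVINO to insert the conversion and scaling steps into the graph itself
    ppp.input().preprocess().convert_element_type(ov.Type.f32).scale(255.0)
    # Declare the layout the original model expects, so the layout conversion is added automatically
    ppp.input().model().set_layout(ov.Layout("NCHW"))

    model_with_preprocessing = ppp.build()
    compiled_with_preprocessing = core.compile_model(model_with_preprocessing, device.value)

With such a model, raw ``u8`` frames could be passed to ``compiled_with_preprocessing``
directly, and the inserted conversion steps would run on the selected device instead of
in application code. The video demo itself follows below.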
code:: ipython3 - - import collections - import time - import cv2 - from IPython import display - import numpy as np - - - def run_instance_segmentation( - source=0, - flip=False, - use_popup=False, - skip_first_frames=0, - model=seg_model, - device=device.value, - video_width: int = None, # if not set the original size is used - ): - player = None - - ov_config = {} - if device != "CPU": - model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device or ("AUTO" in device and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - compiled_model = core.compile_model(model, device, ov_config) - - if seg_model.predictor is None: - custom = {"conf": 0.25, "batch": 1, "save": False, "mode": "predict"} # method defaults - args = {**seg_model.overrides, **custom} - seg_model.predictor = seg_model._smart_load("predictor")(overrides=args, _callbacks=seg_model.callbacks) - seg_model.predictor.setup_model(model=seg_model.model) - - seg_model.predictor.model.ov_compiled_model = compiled_model - - try: - # Create a video player to play with target fps. - player = VideoPlayer(source=source, flip=flip, fps=30, skip_first_frames=skip_first_frames) - # Start capturing. - player.start() - if use_popup: - title = "Press ESC to Exit" - cv2.namedWindow(winname=title, flags=cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE) - - processing_times = collections.deque() - while True: - # Grab the frame. - frame = player.next() - if frame is None: - print("Source ended") - break - - if video_width: - # If the frame is larger than video_width, reduce size to improve the performance. - # If more, increase size for better demo expirience. - scale = video_width / max(frame.shape) - frame = cv2.resize( - src=frame, - dsize=None, - fx=scale, - fy=scale, - interpolation=cv2.INTER_AREA, - ) - - # Get the results. - input_image = np.array(frame) - - start_time = time.time() - detections = seg_model(input_image, verbose=False) - stop_time = time.time() - frame = detections[0].plot() - - processing_times.append(stop_time - start_time) - # Use processing times from last 200 frames. - if len(processing_times) > 200: - processing_times.popleft() - - _, f_width = frame.shape[:2] - # Mean processing time [ms]. - processing_time = np.mean(processing_times) * 1000 - fps = 1000 / processing_time - cv2.putText( - img=frame, - text=f"Inference time: {processing_time:.1f}ms ({fps:.1f} FPS)", - org=(20, 40), - fontFace=cv2.FONT_HERSHEY_COMPLEX, - fontScale=f_width / 1000, - color=(0, 0, 255), - thickness=1, - lineType=cv2.LINE_AA, - ) - # Use this workaround if there is flickering. - if use_popup: - cv2.imshow(winname=title, mat=frame) - key = cv2.waitKey(1) - # escape = 27 - if key == 27: - break - else: - # Encode numpy array to jpg. - _, encoded_img = cv2.imencode(ext=".jpg", img=frame, params=[cv2.IMWRITE_JPEG_QUALITY, 100]) - # Create an IPython image. - i = display.Image(data=encoded_img) - # Display the image in this notebook. - display.clear_output(wait=True) - display.display(i) - # ctrl-c - except KeyboardInterrupt: - print("Interrupted") - # any different error - except RuntimeError as e: - print(e) - finally: - if player is not None: - # Stop capturing. - player.stop() - if use_popup: - cv2.destroyAllWindows() - -Run Live Object Detection and Segmentation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Use a webcam as the video input. By default, the primary webcam is set -with ``source=0``. If you have multiple webcams, each one will be -assigned a consecutive number starting at 0. 
Set ``flip=True`` when -using a front-facing camera. Some web browsers, especially Mozilla -Firefox, may cause flickering. If you experience flickering, -set ``use_popup=True``. - - **NOTE**: To use this notebook with a webcam, you need to run the - notebook on a computer with a webcam. If you run the notebook on a - remote server (for example, in Binder or Google Colab service), the - webcam will not work. By default, the lower cell will run model - inference on a video file. If you want to try live inference on your - webcam set ``WEBCAM_INFERENCE = True`` - -.. code:: ipython3 - - WEBCAM_INFERENCE = False - - if WEBCAM_INFERENCE: - VIDEO_SOURCE = 0 # Webcam - else: - VIDEO_SOURCE = Path("people.mp4") - if VIDEO_SOURCE.exists(): - download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/people.mp4", - "people.mp4", - ) - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - run_instance_segmentation( - source=VIDEO_SOURCE, - flip=True, - use_popup=False, - model=seg_ov_model, - device=device.value, - # video_width=1280, - ) - - - -.. image:: yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_46_0.png - - -.. parsed-literal:: - - Source ended - diff --git a/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_10_1.jpg b/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_10_1.jpg deleted file mode 100644 index 2b3cd05dafc1d2..00000000000000 --- a/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_10_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d628338cc3e26bb5c96c0637f19fb656ecece610932cc7b2ca6d246ceb451f23 -size 85225 diff --git a/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_10_1.png b/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_10_1.png deleted file mode 100644 index cbc54f8cf88f94..00000000000000 --- a/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_10_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b2aa97c1dbead003bc9df7b129c5edb8680c3436fb8a6009a20aa76249c22f6d -size 773912 diff --git a/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_19_1.jpg b/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_19_1.jpg deleted file mode 100644 index 22886c54cd44db..00000000000000 --- a/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_19_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9cd9c6e6398837315b6bc57c03f059a1299204de2990c9c3e3ff8bbdb7432d66 -size 85069 diff --git a/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_19_1.png b/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_19_1.png deleted file mode 100644 index 123911efcb1f68..00000000000000 --- 
a/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_19_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5b649babe11466c4ba5ffe65e0d5cb5cbfd1609f3c8c027b8e4cee51e9f34dc0 -size 781439 diff --git a/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_34_1.jpg b/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_34_1.jpg deleted file mode 100644 index fef271afdbc7d5..00000000000000 --- a/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_34_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:33c499fe056e57956b48dcc00c9c296a77dc87bcbaf4d191dff14838040f93ce -size 84997 diff --git a/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_34_1.png b/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_34_1.png deleted file mode 100644 index 84af202e294894..00000000000000 --- a/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_34_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6585b04c1a519b056ce95e4205f04000db757cc9d64f53194d3a65dd4578cdf5 -size 782898 diff --git a/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_46_0.png b/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_46_0.png deleted file mode 100644 index d892e13c8c0591..00000000000000 --- a/docs/notebooks/yolov11-instance-segmentation-with-output_files/yolov11-instance-segmentation-with-output_46_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:54170dc0bf1dd18c44fa5148110bfa843df4e2f92bcb5622357d52663429f370 -size 500065 diff --git a/docs/notebooks/yolov11-keypoint-detection-with-output.rst b/docs/notebooks/yolov11-keypoint-detection-with-output.rst deleted file mode 100644 index 2b549ee3eae97f..00000000000000 --- a/docs/notebooks/yolov11-keypoint-detection-with-output.rst +++ /dev/null @@ -1,1126 +0,0 @@ -Convert and Optimize YOLOv11 keypoint detection model with OpenVINO™ -==================================================================== - -Keypoint detection/Pose is a task that involves detecting specific -points in an image or video frame. These points are referred to as -keypoints and are used to track movement or pose estimation. YOLOv11 can -detect keypoints in an image or video frame with high accuracy and -speed. - -This tutorial demonstrates step-by-step instructions on how to run and -optimize `PyTorch YOLOv11 Pose -model `__ with OpenVINO. We -consider the steps required for keypoint detection scenario. You can -find more details about model on `model -page `__ in Ultralytics -documentation. - -The tutorial consists of the following steps: - Prepare the PyTorch -model. - Download and prepare a dataset. - Validate the original model. -- Convert the PyTorch model to OpenVINO IR. - Validate the converted -model. - Prepare and run optimization pipeline. - Compare performance of -the FP32 and quantized models. - Compare accuracy of the FP32 and -quantized models. 
- Live demo - - -**Table of contents:** - - -- `Get PyTorch model <#get-pytorch-model>`__ - - - `Prerequisites <#prerequisites>`__ - -- `Instantiate model <#instantiate-model>`__ - - - `Convert model to OpenVINO IR <#convert-model-to-openvino-ir>`__ - - `Verify model inference <#verify-model-inference>`__ - - `Select inference device <#select-inference-device>`__ - - `Test on single image <#test-on-single-image>`__ - -- `Optimize model using NNCF Post-training Quantization - API <#optimize-model-using-nncf-post-training-quantization-api>`__ - - - `Validate Quantized model - inference <#validate-quantized-model-inference>`__ - -- `Compare the Original and Quantized - Models <#compare-the-original-and-quantized-models>`__ - - - `Compare performance of the Original and Quantized - Models <#compare-performance-of-the-original-and-quantized-models>`__ - -- `Other ways to optimize model <#other-ways-to-optimize-model>`__ -- `Live demo <#live-demo>`__ - - - `Run Keypoint Detection on - video <#run-keypoint-detection-on-video>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Get PyTorch model ------------------ - - - -Generally, PyTorch models represent an instance of the -`torch.nn.Module `__ -class, initialized by a state dictionary with model weights. We will use -the YOLOv11 nano model (also known as ``yolo11n-pose``) pre-trained on a -COCO dataset, which is available in this -`repo `__. Similar steps are -also applicable to other Ultralytics models. Typical steps to obtain a -pre-trained model: 1. Create an instance of a model class. 2. Load a -checkpoint state dict, which contains the pre-trained model weights. 3. -Turn the model to evaluation for switching some operations to inference -mode. - -In this case, the creators of the model provide an API that enables -converting the YOLOv11 model to OpenVINO IR. Therefore, we do not need -to do these steps manually. - -Prerequisites -^^^^^^^^^^^^^ - - - -Install necessary packages. - -.. code:: ipython3 - - %pip install -q "openvino>=2024.0.0" "nncf>=2.9.0" - %pip install -q "protobuf==3.20.*" "torch>=2.1" "torchvision>=0.16" "ultralytics==8.3.59" tqdm opencv-python --extra-index-url https://download.pytorch.org/whl/cpu - -Import required utility functions. The lower cell will download the -``notebook_utils`` Python module from GitHub. - -.. code:: ipython3 - - from pathlib import Path - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file, VideoPlayer, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("yolov11-keypoint-detection.ipynb") - -.. code:: ipython3 - - # Download a test sample - IMAGE_PATH = Path("./data/intel_rnb.jpg") - - if not IMAGE_PATH.exists(): - download_file( - url="https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/intel_rnb.jpg", - filename=IMAGE_PATH.name, - directory=IMAGE_PATH.parent, - ) - - - -.. 
parsed-literal:: - - data/intel_rnb.jpg: 0%| | 0.00/288k [00:00`__ provides a suite of -advanced algorithms for Neural Networks inference optimization in -OpenVINO with minimal accuracy drop. We will use 8-bit quantization in -post-training mode (without the fine-tuning pipeline) to optimize -YOLOv8. - -The optimization process contains the following steps: - -1. Create a Dataset for quantization. -2. Run ``nncf.quantize`` for getting an optimized model. -3. Serialize OpenVINO IR model, using the ``openvino.runtime.serialize`` - function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - import ipywidgets as widgets - - int8_model_pose_path = Path(f"{POSE_MODEL_NAME}_openvino_int8_model/{POSE_MODEL_NAME}.xml") - quantized_pose_model = None - - to_quantize = widgets.Checkbox( - value=True, - description="Quantization", - disabled=False, - ) - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. code:: ipython3 - - # Fetch skip_kernel_extension module - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -Reuse validation dataloader in accuracy testing for quantization. For -that, it should be wrapped into the ``nncf.Dataset`` object and define a -transformation function for getting only input tensors. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - from typing import Dict - - from zipfile import ZipFile - - from ultralytics.data.utils import DATASETS_DIR - from ultralytics.utils import DEFAULT_CFG - from ultralytics.cfg import get_cfg - from ultralytics.data.utils import check_det_dataset - from ultralytics.models.yolo.pose import PoseValidator - from ultralytics.utils.metrics import OKS_SIGMA - - if not int8_model_pose_path.exists(): - - DATA_URL = "https://ultralytics.com/assets/coco8-pose.zip" - CFG_URL = "https://raw.githubusercontent.com/ultralytics/ultralytics/v8.1.0/ultralytics/cfg/datasets/coco8-pose.yaml" - - OUT_DIR = DATASETS_DIR - - DATA_PATH = OUT_DIR / "val2017.zip" - CFG_PATH = OUT_DIR / "coco8-pose.yaml" - - if not (OUT_DIR / "coco8-pose/labels").exists(): - download_file(DATA_URL, DATA_PATH.name, DATA_PATH.parent) - download_file(CFG_URL, CFG_PATH.name, CFG_PATH.parent) - with ZipFile(DATA_PATH, "r") as zip_ref: - zip_ref.extractall(OUT_DIR) - - args = get_cfg(cfg=DEFAULT_CFG) - args.data = "coco8-pose.yaml" - - pose_validator = PoseValidator(args=args) - pose_validator.data = check_det_dataset(args.data) - pose_validator.stride = 32 - pose_data_loader = pose_validator.get_dataloader(OUT_DIR / "coco8-pose", 1) - - pose_validator.is_coco = True - pose_validator.names = label_map - pose_validator.metrics.names = pose_validator.names - pose_validator.nc = 1 - pose_validator.sigma = OKS_SIGMA - - - def transform_fn(data_item:Dict): - """ - Quantization transform function. Extracts and preprocess input data from dataloader item for quantization. 
- Parameters: - data_item: Dict with data item produced by DataLoader during iteration - Returns: - input_tensor: Input data for quantization - """ - input_tensor = pose_validator.preprocess(data_item)['img'].numpy() - return input_tensor - - - quantization_dataset = nncf.Dataset(pose_data_loader, transform_fn) - - -.. parsed-literal:: - - '/home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov8-optimization/datasets/val2017.zip' already exists. - - - -.. parsed-literal:: - - /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov8-optimization/datasets/coco8-pos… - - -.. parsed-literal:: - - val: Scanning /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov8-optimization/datasets/coco8-pose/labels/train.cache... 8 images, 0 backgrounds, 0 corrupt: 100%|██████████| 8/8 [00:00`__ -to measure the inference performance of the ``FP32`` and ``INT8`` -models. - - **Note**: For more accurate performance, it is recommended to run - ``benchmark_app`` in a terminal/command prompt after closing other - applications. Run - ``benchmark_app -m -d CPU -shape ""`` to - benchmark async inference on CPU on specific input data shape for one - minute. Change ``CPU`` to ``GPU`` to benchmark on GPU. Run - ``benchmark_app --help`` to see an overview of all command-line - options. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - device - -.. code:: ipython3 - - if int8_model_pose_path.exists(): - # Inference FP32 model (OpenVINO IR) - !benchmark_app -m $pose_model_path -d $device.value -api async -shape "[1,3,640,640]" -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ WARNING ] Default duration 120 seconds is used for unknown device AUTO - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.5.0-16814-e1c167a841c - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.5.0-16814-e1c167a841c - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 21.31 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [?,3,?,?] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_9) : f32 / [...] / [?,56,21..] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 8.96 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_9) : f32 / [...] 
/ [1,56,8400] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 419.55 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 120000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 34.60 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 20292 iterations - [ INFO ] Duration: 120102.20 ms - [ INFO ] Latency: - [ INFO ] Median: 68.35 ms - [ INFO ] Average: 70.85 ms - [ INFO ] Min: 43.18 ms - [ INFO ] Max: 167.33 ms - [ INFO ] Throughput: 168.96 FPS - - -.. code:: ipython3 - - if int8_model_pose_path.exists(): - # Inference INT8 model (OpenVINO IR) - !benchmark_app -m $int8_model_pose_path -d $device.value -api async -shape "[1,3,640,640]" -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.5.0-16814-e1c167a841c - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.5.0-16814-e1c167a841c - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 32.88 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_9) : f32 / [...] 
/ [1,56,8400] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 0.05 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_9) : f32 / [...] / [1,56,8400] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 681.22 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 18 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 18 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 18 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 18 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 25.13 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 6480 iterations - [ INFO ] Duration: 15069.23 ms - [ INFO ] Latency: - [ INFO ] Median: 39.96 ms - [ INFO ] Average: 41.67 ms - [ INFO ] Min: 29.99 ms - [ INFO ] Max: 117.58 ms - [ INFO ] Throughput: 430.02 FPS - - -Compare accuracy of the Original and Quantized Models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -As we can see, there is no significant difference between ``INT8`` and -float model result in a single image test. To understand how -quantization influences model prediction precision, we can compare model -accuracy on a dataset. - -Other ways to optimize model ----------------------------- - - - -The performance could be also improved by another OpenVINO method such -as async inference pipeline or preprocessing API. - -Async Inference pipeline help to utilize the device more optimal. The -key advantage of the Async API is that when a device is busy with -inference, the application can perform other tasks in parallel (for -example, populating inputs or scheduling other requests) rather than -wait for the current inference to complete first. 
To understand how to -perform async inference using openvino, refer to `Async API -tutorial `__ - -Preprocessing API enables making preprocessing a part of the model -reducing application code and dependency on additional image processing -libraries. The main advantage of Preprocessing API is that preprocessing -steps will be integrated into the execution graph and will be performed -on a selected device (CPU/GPU etc.) rather than always being executed on -CPU as part of an application. This will also improve selected device -utilization. For more information, refer to the overview of -`Preprocessing API -tutorial `__. To -see, how it could be used with YOLOV8 object detection model , please, -see `Convert and Optimize YOLOv8 real-time object detection with -OpenVINO tutorial `__ - -Live demo ---------- - - - -The following code runs model inference on a video: - -.. code:: ipython3 - - import collections - import time - from IPython import display - import cv2 - import numpy as np - - - def run_keypoint_detection( - source=0, - flip=False, - use_popup=False, - skip_first_frames=0, - model=pose_model, - device=device.value, - ): - player = None - - ov_config = {} - if device != "CPU": - model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device or ("AUTO" in device and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - compiled_model = core.compile_model(model, device, ov_config) - - if pose_model.predictor is None: - custom = {"conf": 0.25, "batch": 1, "save": False, "mode": "predict"} # method defaults - args = {**seg_model.overrides, **custom} - pose_model.predictor = pose_model._smart_load("predictor")(overrides=args, _callbacks=pose_model.callbacks) - pose_model.predictor.setup_model(model=pose_model.model) - - pose_model.predictor.model.ov_compiled_model = compiled_model - - try: - # Create a video player to play with target fps. - player = VideoPlayer(source=source, flip=flip, fps=30, skip_first_frames=skip_first_frames) - # Start capturing. - player.start() - if use_popup: - title = "Press ESC to Exit" - cv2.namedWindow(winname=title, flags=cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE) - - processing_times = collections.deque() - while True: - # Grab the frame. - frame = player.next() - if frame is None: - print("Source ended") - break - # If the frame is larger than full HD, reduce size to improve the performance. - scale = 1280 / max(frame.shape) - if scale < 1: - frame = cv2.resize( - src=frame, - dsize=None, - fx=scale, - fy=scale, - interpolation=cv2.INTER_AREA, - ) - # Get the results - input_image = np.array(frame) - - start_time = time.time() - - detections = pose_model(input_image, verbose=False) - stop_time = time.time() - frame = detections[0].plot() - - processing_times.append(stop_time - start_time) - # Use processing times from last 200 frames. - if len(processing_times) > 200: - processing_times.popleft() - - _, f_width = frame.shape[:2] - # Mean processing time [ms]. - processing_time = np.mean(processing_times) * 1000 - fps = 1000 / processing_time - cv2.putText( - img=frame, - text=f"Inference time: {processing_time:.1f}ms ({fps:.1f} FPS)", - org=(20, 40), - fontFace=cv2.FONT_HERSHEY_COMPLEX, - fontScale=f_width / 1000, - color=(0, 0, 255), - thickness=1, - lineType=cv2.LINE_AA, - ) - # Use this workaround if there is flickering. - if use_popup: - cv2.imshow(winname=title, mat=frame) - key = cv2.waitKey(1) - # escape = 27 - if key == 27: - break - else: - # Encode numpy array to jpg. 
- _, encoded_img = cv2.imencode(ext=".jpg", img=frame, params=[cv2.IMWRITE_JPEG_QUALITY, 100]) - # Create an IPython image. - i = display.Image(data=encoded_img) - # Display the image in this notebook. - display.clear_output(wait=True) - display.display(i) - # ctrl-c - except KeyboardInterrupt: - print("Interrupted") - # any different error - except RuntimeError as e: - print(e) - finally: - if player is not None: - # Stop capturing. - player.stop() - if use_popup: - cv2.destroyAllWindows() - -Run Keypoint Detection on video -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - # VIDEO_SOURCE = 0 #for webcam - VIDEO_URL = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/people.mp4" - VIDEO_SOURCE = Path("people.mp4") - - if not VIDEO_SOURCE.exists(): - download_file(VIDEO_URL) - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - run_keypoint_detection( - source=VIDEO_SOURCE, - flip=True, - use_popup=False, - model=pose_ov_model, - device=device.value, - ) - - - -.. image:: yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_43_0.png - - -.. parsed-literal:: - - Source ended - diff --git a/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_16_1.jpg b/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_16_1.jpg deleted file mode 100644 index 0fe619b4d25368..00000000000000 --- a/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_16_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:48d9b79f4e9821906a5550d2872da58719e5fdf14f21ec386d73a11f21466293 -size 59324 diff --git a/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_16_1.png b/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_16_1.png deleted file mode 100644 index 911d0dc24506c1..00000000000000 --- a/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_16_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b0ebeed7fce17f3ff093d8a8c3ed5f20e4caf98957b7c807215eb5f58c72b851 -size 581548 diff --git a/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_30_1.jpg b/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_30_1.jpg deleted file mode 100644 index 0fe619b4d25368..00000000000000 --- a/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_30_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:48d9b79f4e9821906a5550d2872da58719e5fdf14f21ec386d73a11f21466293 -size 59324 diff --git a/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_30_1.png b/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_30_1.png deleted file mode 100644 index 911d0dc24506c1..00000000000000 --- a/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_30_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:b0ebeed7fce17f3ff093d8a8c3ed5f20e4caf98957b7c807215eb5f58c72b851 -size 581548 diff --git a/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_43_0.png b/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_43_0.png deleted file mode 100644 index 45cce64ea72285..00000000000000 --- a/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_43_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b4e061eeed8078e471e03884db44ceb1d49c3985a34e4d3456e79985fd0db21e -size 513057 diff --git a/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_9_3.jpg b/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_9_3.jpg deleted file mode 100644 index 036407491e7f8f..00000000000000 --- a/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_9_3.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8b389ac8525169a7070c184be29337d4a7af221d63e9ff6a089683064c1d4389 -size 59379 diff --git a/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_9_3.png b/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_9_3.png deleted file mode 100644 index 76ea9232fe5123..00000000000000 --- a/docs/notebooks/yolov11-keypoint-detection-with-output_files/yolov11-keypoint-detection-with-output_9_3.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1f91e9067fabaaa5433f0866617c33e508eaa7d10af97bec52e77b25529c13cb -size 581985 diff --git a/docs/notebooks/yolov11-object-detection-with-output.rst b/docs/notebooks/yolov11-object-detection-with-output.rst deleted file mode 100644 index 0489fc356a26c6..00000000000000 --- a/docs/notebooks/yolov11-object-detection-with-output.rst +++ /dev/null @@ -1,966 +0,0 @@ -Convert and Optimize YOLOv11 real-time object detection with OpenVINO™ -====================================================================== - -Real-time object detection is often used as a key component in computer -vision systems. Applications that use real-time object detection models -include video analytics, robotics, autonomous vehicles, multi-object -tracking and object counting, medical image analysis, and many others. - -This tutorial demonstrates step-by-step instructions on how to run and -optimize PyTorch YOLOv11 with OpenVINO. We consider the steps required -for object detection scenario. You can find more details about model on -`model page `__ in -Ultralytics documentation - -The tutorial consists of the following steps: - Prepare the PyTorch -model. - Download and prepare a dataset. - Validate the original model. -- Convert the PyTorch model to OpenVINO IR. - Prepare and run -optimization pipeline. - Compare performance of the FP32 and quantized -models. 
- Other optimization possibilities with OpenVINO api - Live demo - - -**Table of contents:** - - -- `Get PyTorch model <#get-pytorch-model>`__ - - - `Prerequisites <#prerequisites>`__ - -- `Instantiate model <#instantiate-model>`__ - - - `Convert model to OpenVINO IR <#convert-model-to-openvino-ir>`__ - - `Verify model inference <#verify-model-inference>`__ - - `Select inference device <#select-inference-device>`__ - - `Test on single image <#test-on-single-image>`__ - -- `Optimize model using NNCF Post-training Quantization - API <#optimize-model-using-nncf-post-training-quantization-api>`__ - - - `Validate Quantized model - inference <#validate-quantized-model-inference>`__ - -- `Compare the Original and Quantized - Models <#compare-the-original-and-quantized-models>`__ - - - `Compare performance object detection - models <#compare-performance-object-detection-models>`__ - -- `Next steps <#next-steps>`__ - - - `Async inference pipeline <#async-inference-pipeline>`__ - -- `Live demo <#live-demo>`__ - - - `Run Live Object Detection <#run-live-object-detection>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Get PyTorch model ------------------ - - - -Generally, PyTorch models represent an instance of the -`torch.nn.Module `__ -class, initialized by a state dictionary with model weights. We will use -the YOLOv11 nano model (also known as ``yolo11n``) pre-trained on a COCO -dataset, which is available in this -`repo `__. Similar steps are -also applicable to other YOLOv11 models. Typical steps to obtain a -pre-trained model: 1. Create an instance of a model class. 2. Load a -checkpoint state dict, which contains the pre-trained model weights. 3. -Turn the model to evaluation for switching some operations to inference -mode. - -In this case, the creators of the model provide an API that enables -converting the YOLOv11 model to OpenVINO IR. Therefore, we do not need -to do these steps manually. - -Prerequisites -^^^^^^^^^^^^^ - - - -Install necessary packages. - -.. code:: ipython3 - - %pip install -q "openvino>=2024.0.0" "nncf>=2.9.0" - %pip install -q "torch>=2.1" "torchvision>=0.16" "ultralytics==8.3.59" onnx tqdm opencv-python --extra-index-url https://download.pytorch.org/whl/cpu - -Import required utility functions. The lower cell will download the -``notebook_utils`` Python module from GitHub. - -.. code:: ipython3 - - from pathlib import Path - import requests - - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file, VideoPlayer, device_widget, quantization_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("yolov11-object-detection.ipynb") - -.. code:: ipython3 - - # Download a test sample - IMAGE_PATH = Path("./data/coco_bike.jpg") - - if not IMAGE_PATH.exists(): - download_file( - url="https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bike.jpg", - filename=IMAGE_PATH.name, - directory=IMAGE_PATH.parent, - ) - - -.. 
parsed-literal:: - - 'data/coco_bike.jpg' already exists. - - - - -.. parsed-literal:: - - PosixPath('/home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov11-optimization/data/coco_bike.jpg') - - - -Instantiate model ------------------ - - - -There are `several -models `__ available in the -original repository, targeted for different tasks. For loading the -model, required to specify a path to the model checkpoint. It can be -some local path or name available on models hub (in this case model -checkpoint will be downloaded automatically). - -You can select one of represented model using widget bellow: - -.. code:: ipython3 - - import ipywidgets as widgets - - model_id = ["yolo11n", "yolo11s", "yolo11m", "yolo11l", "yolo11x", "yolov8n", "yolov8s", "yolov8m", "yolov8l", "yolov8x"] - - model_name = widgets.Dropdown(options=model_id, value=model_id[0], description="Model") - - model_name - - - - -.. parsed-literal:: - - Dropdown(description='Model', options=('yolo11n', 'yolo11s', 'yolo11m', 'yolo11l', 'yolo11x', 'yolov8n', 'yolo… - - - -Making prediction, the model accepts a path to input image and returns -list with Results class object. Results contains boxes for object -detection model. Also it contains utilities for processing results, for -example, ``plot()`` method for drawing. - -Let us consider the examples: - -.. code:: ipython3 - - from PIL import Image - from ultralytics import YOLO - - DET_MODEL_NAME = model_name.value - - det_model = YOLO(f"{DET_MODEL_NAME}.pt") - det_model.to("cpu") - label_map = det_model.model.names - - res = det_model(IMAGE_PATH) - Image.fromarray(res[0].plot()[:, :, ::-1]) - - -.. parsed-literal:: - - - image 1/1 /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov11-optimization/data/coco_bike.jpg: 480x640 2 bicycles, 2 cars, 2 dogs, 102.3ms - Speed: 3.0ms preprocess, 102.3ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640) - - - - -.. image:: yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_10_1.png - - - -Convert model to OpenVINO IR -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Ultralytics provides API for convenient model exporting to different -formats including OpenVINO IR. ``model.export`` is responsible for model -conversion. We need to specify the format, and additionally, we can -preserve dynamic shapes in the model. - -.. code:: ipython3 - - # object detection model - det_model_path = Path(f"{DET_MODEL_NAME}_openvino_model/{DET_MODEL_NAME}.xml") - if not det_model_path.exists(): - det_model.export(format="openvino", dynamic=True, half=True) - -Verify model inference -~~~~~~~~~~~~~~~~~~~~~~ - - - -We can reuse the base model pipeline for pre- and postprocessing just -replacing the inference method where we will use the IR model for -inference. - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -Select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Test on single image -~~~~~~~~~~~~~~~~~~~~ - - - -Now, once we have defined preprocessing and postprocessing steps, we are -ready to check model prediction for object detection. - -.. 
code:: ipython3 - - import openvino as ov - - core = ov.Core() - - det_ov_model = core.read_model(det_model_path) - - ov_config = {} - if device.value != "CPU": - det_ov_model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device.value or ("AUTO" in device.value and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - det_compiled_model = core.compile_model(det_ov_model, device.value, ov_config) - det_model = YOLO(det_model_path.parent, task="detect") - - if det_model.predictor is None: - custom = {"conf": 0.25, "batch": 1, "save": False, "mode": "predict"} # method defaults - args = {**det_model.overrides, **custom} - det_model.predictor = det_model._smart_load("predictor")(overrides=args, _callbacks=det_model.callbacks) - det_model.predictor.setup_model(model=det_model.model) - - det_model.predictor.model.ov_compiled_model = det_compiled_model - - res = det_model(IMAGE_PATH) - Image.fromarray(res[0].plot()[:, :, ::-1]) - - -.. parsed-literal:: - - Ultralytics 8.3.0 🚀 Python-3.8.10 torch-2.4.1+cu121 CUDA:0 (NVIDIA GeForce RTX 3090, 24250MiB) - Loading yolo11n_openvino_model for OpenVINO inference... - Using OpenVINO LATENCY mode for batch=1 inference... - - image 1/1 /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov11-optimization/data/coco_bike.jpg: 640x640 1 bicycle, 2 cars, 1 dog, 18.2ms - Speed: 3.3ms preprocess, 18.2ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640) - - - - -.. image:: yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_17_1.png - - - -Optimize model using NNCF Post-training Quantization API --------------------------------------------------------- - - - -`NNCF `__ provides a suite of -advanced algorithms for Neural Networks inference optimization in -OpenVINO with minimal accuracy drop. We will use 8-bit quantization in -post-training mode (without the fine-tuning pipeline) to optimize model. - -The optimization process contains the following steps: - -1. Create a Dataset for quantization. -2. Run ``nncf.quantize`` for getting an optimized model. -3. Serialize OpenVINO IR model, using the ``openvino.runtime.serialize`` - function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - int8_model_det_path = Path(f"{DET_MODEL_NAME}_openvino_int8_model/{DET_MODEL_NAME}.xml") - quantized_det_model = None - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. code:: ipython3 - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - from ultralytics.utils import DEFAULT_CFG - from ultralytics.cfg import get_cfg - from ultralytics.data.converter import coco80_to_coco91_class - from ultralytics.data.utils import check_det_dataset - from zipfile import ZipFile - - from ultralytics.data.utils import DATASETS_DIR - - - DATA_URL = "http://images.cocodataset.org/zips/val2017.zip" - LABELS_URL = "https://github.com/ultralytics/yolov5/releases/download/v1.0/coco2017labels-segments.zip" - CFG_URL = "https://raw.githubusercontent.com/ultralytics/ultralytics/v8.1.0/ultralytics/cfg/datasets/coco.yaml" - - OUT_DIR = DATASETS_DIR - - DATA_PATH = OUT_DIR / "val2017.zip" - LABELS_PATH = OUT_DIR / "coco2017labels-segments.zip" - CFG_PATH = OUT_DIR / "coco.yaml" - - if not int8_model_det_path.exists(): - if not (OUT_DIR / "coco/labels").exists(): - download_file(DATA_URL, DATA_PATH.name, DATA_PATH.parent) - download_file(LABELS_URL, LABELS_PATH.name, LABELS_PATH.parent) - download_file(CFG_URL, CFG_PATH.name, CFG_PATH.parent) - with ZipFile(LABELS_PATH, "r") as zip_ref: - zip_ref.extractall(OUT_DIR) - with ZipFile(DATA_PATH, "r") as zip_ref: - zip_ref.extractall(OUT_DIR / "coco/images") - - - args = get_cfg(cfg=DEFAULT_CFG) - args.data = str(CFG_PATH) - det_validator = det_model.task_map[det_model.task]["validator"](args=args) - det_validator.data = check_det_dataset(args.data) - det_validator.stride = 32 - det_data_loader = det_validator.get_dataloader(OUT_DIR / "coco", 1) - - det_validator.is_coco = True - det_validator.class_map = coco80_to_coco91_class() - det_validator.names = label_map - det_validator.metrics.names = det_validator.names - det_validator.nc = 80 - -Reuse validation dataloader in accuracy testing for quantization. For -that, it should be wrapped into the ``nncf.Dataset`` object and define a -transformation function for getting only input tensors. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - from typing import Dict - - - if not int8_model_det_path.exists(): - - - def transform_fn(data_item:Dict): - """ - Quantization transform function. Extracts and preprocess input data from dataloader item for quantization. - Parameters: - data_item: Dict with data item produced by DataLoader during iteration - Returns: - input_tensor: Input data for quantization - """ - input_tensor = det_validator.preprocess(data_item)['img'].numpy() - return input_tensor - - - quantization_dataset = nncf.Dataset(det_data_loader, transform_fn) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -The ``nncf.quantize`` function provides an interface for model -quantization. It requires an instance of the OpenVINO Model and -quantization dataset. Optionally, some additional parameters for the -configuration quantization process (number of samples for quantization, -preset, ignored scope, etc.) can be provided. Ultralytics modes contain -non-ReLU activation functions, which require asymmetric quantization of -activations. To achieve a better result, we will use a ``mixed`` -quantization preset. It provides symmetric quantization of weights and -asymmetric quantization of activations. For more accurate results, we -should keep the operation in the postprocessing subgraph in floating -point precision, using the ``ignored_scope`` parameter. - - **Note**: Model post-training quantization is time-consuming process. - Be patient, it can take several minutes depending on your hardware. - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - if not int8_model_det_path.exists(): - ignored_scope = nncf.IgnoredScope( # post-processing - subgraphs=[ - nncf.Subgraph(inputs=[f"__module.model.{22 if 'v8' in DET_MODEL_NAME else 23}/aten::cat/Concat", - f"__module.model.{22 if 'v8' in DET_MODEL_NAME else 23}/aten::cat/Concat_1", - f"__module.model.{22 if 'v8' in DET_MODEL_NAME else 23}/aten::cat/Concat_2"], - outputs=[f"__module.model.{22 if 'v8' in DET_MODEL_NAME else 23}/aten::cat/Concat_7"]) - ] - ) - - # Detection model - quantized_det_model = nncf.quantize( - det_ov_model, - quantization_dataset, - preset=nncf.QuantizationPreset.MIXED, - ignored_scope=ignored_scope - ) - print(f"Quantized detection model will be saved to {int8_model_det_path}") - ov.save_model(quantized_det_model, str(int8_model_det_path)) - -Validate Quantized model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -``nncf.quantize`` returns the OpenVINO Model class instance, which is -suitable for loading on a device for making predictions. ``INT8`` model -input data and output result formats have no difference from the -floating point model representation. Therefore, we can reuse the same -``detect`` function defined above for getting the ``INT8`` model result -on the image. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - device - -.. code:: ipython3 - - %%skip not $to_quantize.value - - if quantized_det_model is None and int8_model_det_path.exists(): - quantized_det_model = core.read_model(int8_model_det_path) - - ov_config = {} - if device.value != "CPU": - quantized_det_model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device.value or ("AUTO" in device.value and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - quantized_det_compiled_model = core.compile_model(quantized_det_model, device.value, ov_config) - - - if det_model.predictor is None: - custom = {"conf": 0.25, "batch": 1, "save": False, "mode": "predict"} # method defaults - args = {**det_model.overrides, **custom} - det_model.predictor = det_model._smart_load("predictor")(overrides=args, _callbacks=det_model.callbacks) - det_model.predictor.setup_model(model=det_model.model) - - det_model.predictor.model.ov_compiled_model = det_compiled_model - - res = det_model(IMAGE_PATH) - display(Image.fromarray(res[0].plot()[:, :, ::-1])) - - -.. parsed-literal:: - - - image 1/1 /home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/yolov11-optimization/data/coco_bike.jpg: 640x640 1 bicycle, 2 cars, 1 dog, 14.2ms - Speed: 3.6ms preprocess, 14.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640) - - - -.. image:: yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_30_1.png - - -Compare the Original and Quantized Models ------------------------------------------ - - - -Compare performance object detection models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Finally, use the OpenVINO `Benchmark -Tool `__ -to measure the inference performance of the ``FP32`` and ``INT8`` -models. - - **Note**: For more accurate performance, it is recommended to run - ``benchmark_app`` in a terminal/command prompt after closing other - applications. Run - ``benchmark_app -m -d CPU -shape ""`` to - benchmark async inference on CPU on specific input data shape for one - minute. Change ``CPU`` to ``GPU`` to benchmark on GPU. Run - ``benchmark_app --help`` to see an overview of all command-line - options. - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - device - -.. code:: ipython3 - - if int8_model_det_path.exists(): - # Inference FP32 model (OpenVINO IR) - !benchmark_app -m $det_model_path -d $device.value -api async -shape "[1,3,640,640]" -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ WARNING ] Default duration 120 seconds is used for unknown device AUTO - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.5.0-16814-e1c167a841c - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.5.0-16814-e1c167a841c - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 20.65 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [?,3,?,?] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_7) : f32 / [...] / [?,84,21..] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 8.77 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_7) : f32 / [...] / [1,84,8400] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 524.58 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 120000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). 
- [ INFO ] First inference took 29.31 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 20040 iterations - [ INFO ] Duration: 120081.07 ms - [ INFO ] Latency: - [ INFO ] Median: 68.90 ms - [ INFO ] Average: 71.71 ms - [ INFO ] Min: 47.48 ms - [ INFO ] Max: 162.55 ms - [ INFO ] Throughput: 166.89 FPS - - -.. code:: ipython3 - - if int8_model_det_path.exists(): - # Inference INT8 model (OpenVINO IR) - !benchmark_app -m $int8_model_det_path -d $device.value -api async -shape "[1,3,640,640]" -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.5.0-16814-e1c167a841c - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.5.0-16814-e1c167a841c - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 31.70 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_7) : f32 / [...] / [1,84,8400] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 0.05 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_7) : f32 / [...] / [1,84,8400] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 729.93 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 18 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 18 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 18 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! 
- [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 18 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 22.63 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 6210 iterations - [ INFO ] Duration: 15054.55 ms - [ INFO ] Latency: - [ INFO ] Median: 41.05 ms - [ INFO ] Average: 43.44 ms - [ INFO ] Min: 31.34 ms - [ INFO ] Max: 112.15 ms - [ INFO ] Throughput: 412.50 FPS - - -Next steps ----------- - -This section contains -suggestions on how to additionally improve the performance of your -application using OpenVINO. - -Async inference pipeline -~~~~~~~~~~~~~~~~~~~~~~~~ - -The key advantage of the Async -API is that when a device is busy with inference, the application can -perform other tasks in parallel (for example, populating inputs or -scheduling other requests) rather than wait for the current inference to -complete first. To understand how to perform async inference using -openvino, refer to `Async API tutorial `__ - -Live demo ---------- - - - -The following code runs model inference on a video: - -.. code:: ipython3 - - import collections - import time - from IPython import display - import cv2 - import numpy as np - - - # Main processing function to run object detection. - def run_object_detection( - source=0, - flip=False, - use_popup=False, - skip_first_frames=0, - model=det_model, - device=device.value, - video_width: int = None, # if not set the original size is used - ): - player = None - ov_config = {} - if device != "CPU": - model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device or ("AUTO" in device and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - compiled_model = core.compile_model(model, device, ov_config) - - if det_model.predictor is None: - custom = {"conf": 0.25, "batch": 1, "save": False, "mode": "predict"} # method defaults - args = {**det_model.overrides, **custom} - det_model.predictor = det_model._smart_load("predictor")(overrides=args, _callbacks=det_model.callbacks) - det_model.predictor.setup_model(model=det_model.model) - - det_model.predictor.model.ov_compiled_model = compiled_model - - try: - # Create a video player to play with target fps. - player = VideoPlayer(source=source, flip=flip, fps=30, skip_first_frames=skip_first_frames) - # Start capturing. - player.start() - if use_popup: - title = "Press ESC to Exit" - cv2.namedWindow(winname=title, flags=cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE) - - processing_times = collections.deque() - while True: - # Grab the frame. - frame = player.next() - if frame is None: - print("Source ended") - break - - if video_width: - # If the frame is larger than video_width, reduce size to improve the performance. - # If more, increase size for better demo expirience. - - scale = video_width / max(frame.shape) - frame = cv2.resize( - src=frame, - dsize=None, - fx=scale, - fy=scale, - interpolation=cv2.INTER_AREA, - ) - - # Get the results. - input_image = np.array(frame) - - start_time = time.time() - detections = det_model(input_image, verbose=False) - stop_time = time.time() - frame = detections[0].plot() - - processing_times.append(stop_time - start_time) - # Use processing times from last 200 frames. - if len(processing_times) > 200: - processing_times.popleft() - - _, f_width = frame.shape[:2] - # Mean processing time [ms]. 
- processing_time = np.mean(processing_times) * 1000 - fps = 1000 / processing_time - cv2.putText( - img=frame, - text=f"Inference time: {processing_time:.1f}ms ({fps:.1f} FPS)", - org=(20, 40), - fontFace=cv2.FONT_HERSHEY_COMPLEX, - fontScale=f_width / 1000, - color=(0, 0, 255), - thickness=1, - lineType=cv2.LINE_AA, - ) - # Use this workaround if there is flickering. - if use_popup: - cv2.imshow(winname=title, mat=frame) - key = cv2.waitKey(1) - # escape = 27 - if key == 27: - break - else: - # Encode numpy array to jpg. - _, encoded_img = cv2.imencode(ext=".jpg", img=frame, params=[cv2.IMWRITE_JPEG_QUALITY, 100]) - # Create an IPython image. - i = display.Image(data=encoded_img) - # Display the image in this notebook. - display.clear_output(wait=True) - display.display(i) - # ctrl-c - except KeyboardInterrupt: - print("Interrupted") - # any different error - except RuntimeError as e: - print(e) - finally: - if player is not None: - # Stop capturing. - player.stop() - if use_popup: - cv2.destroyAllWindows() - -Run Live Object Detection -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Use a webcam as the video input. By default, the primary webcam is set -with ``source=0``. If you have multiple webcams, each one will be -assigned a consecutive number starting at 0. Set ``flip=True`` when -using a front-facing camera. Some web browsers, especially Mozilla -Firefox, may cause flickering. If you experience flickering, -set ``use_popup=True``. - - **NOTE**: To use this notebook with a webcam, you need to run the - notebook on a computer with a webcam. If you run the notebook on a - remote server (for example, in Binder or Google Colab service), the - webcam will not work. By default, the lower cell will run model - inference on a video file. If you want to try live inference on your - webcam set ``WEBCAM_INFERENCE = True`` - -Run the object detection: - -.. code:: ipython3 - - WEBCAM_INFERENCE = False - - if WEBCAM_INFERENCE: - VIDEO_SOURCE = 0 # Webcam - else: - VIDEO_SOURCE = "people.mp4" - if not Path(VIDEO_SOURCE).exists(): - download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/people.mp4", - "people.mp4", - ) - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - run_object_detection( - source=VIDEO_SOURCE, - flip=True, - use_popup=False, - model=det_ov_model, - device=device.value, - # video_width=1280, - ) - - - -.. image:: yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_43_0.png - - -.. 
parsed-literal:: - - Source ended - diff --git a/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_10_1.jpg b/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_10_1.jpg deleted file mode 100644 index bcf00f3d50dcb5..00000000000000 --- a/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_10_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cc502c95e9f5526befe662fcc79a6a91d4f5ff0cfce09165436e91f4ef8224ad -size 113613 diff --git a/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_10_1.png b/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_10_1.png deleted file mode 100644 index e886ccd29b47f9..00000000000000 --- a/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_10_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c7bb27d4264dbc22e0006e42fb2f1619992aeac7220232119578e42a0d8a083 -size 904352 diff --git a/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_17_1.jpg b/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_17_1.jpg deleted file mode 100644 index 7a742941b54c17..00000000000000 --- a/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_17_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:394895a09cc413a6343edb2a6f6242258701b6911c09d8405eacc7c27f6fac62 -size 110819 diff --git a/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_17_1.png b/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_17_1.png deleted file mode 100644 index 6d4bfe012b6187..00000000000000 --- a/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_17_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7b18cc2e60a802c059cd5cc05598a06c4affd959216e03674eeb68e822295843 -size 913173 diff --git a/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_30_1.jpg b/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_30_1.jpg deleted file mode 100644 index 7a742941b54c17..00000000000000 --- a/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_30_1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:394895a09cc413a6343edb2a6f6242258701b6911c09d8405eacc7c27f6fac62 -size 110819 diff --git a/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_30_1.png b/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_30_1.png deleted file mode 100644 index 6d4bfe012b6187..00000000000000 --- a/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_30_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7b18cc2e60a802c059cd5cc05598a06c4affd959216e03674eeb68e822295843 -size 913173 diff --git a/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_43_0.png 
b/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_43_0.png deleted file mode 100644 index c98997b2715d3d..00000000000000 --- a/docs/notebooks/yolov11-object-detection-with-output_files/yolov11-object-detection-with-output_43_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:baaca70dfb7cbc2d6a4ff9546ad9e8c052c7020d283c014e850a78f52bbf8b90 -size 572425 diff --git a/docs/notebooks/yolov11-quantization-with-accuracy-control-with-output.rst b/docs/notebooks/yolov11-quantization-with-accuracy-control-with-output.rst deleted file mode 100644 index 3ce952c6c71b8e..00000000000000 --- a/docs/notebooks/yolov11-quantization-with-accuracy-control-with-output.rst +++ /dev/null @@ -1,701 +0,0 @@ -Convert and Optimize YOLOv11 with OpenVINO™ -=========================================== - -The YOLOv11 algorithm developed by Ultralytics is a cutting-edge, -state-of-the-art (SOTA) model that is designed to be fast, accurate, and -easy to use, making it an excellent choice for a wide range of object -detection, image segmentation, and image classification tasks. More -details about its realization can be found in the original model -`repository `__. - -This tutorial demonstrates step-by-step instructions on how to run apply -quantization with accuracy control to PyTorch YOLOv11. The advanced -quantization flow allows to apply 8-bit quantization to the model with -control of accuracy metric. This is achieved by keeping the most -impactful operations within the model in the original precision. The -flow is based on the `Basic 8-bit -quantization `__ -and has the following differences: - -- Besides the calibration dataset, a validation dataset is required to - compute the accuracy metric. Both datasets can refer to the same data - in the simplest case. -- Validation function, used to compute accuracy metric is required. It - can be a function that is already available in the source framework - or a custom function. -- Since accuracy validation is run several times during the - quantization process, quantization with accuracy control can take - more time than the Basic 8-bit quantization flow. -- The resulted model can provide smaller performance improvement than - the Basic 8-bit quantization flow because some of the operations are - kept in the original precision. - -.. - - **NOTE**: Currently, 8-bit quantization with accuracy control in NNCF - is available only for models in OpenVINO representation. - -The steps for the quantization with accuracy control are described -below. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Get Pytorch model and OpenVINO IR - model <#get-pytorch-model-and-openvino-ir-model>`__ - - - `Define validator and data - loader <#define-validator-and-data-loader>`__ - - `Prepare calibration and validation - datasets <#prepare-calibration-and-validation-datasets>`__ - - `Prepare validation function <#prepare-validation-function>`__ - -- `Run quantization with accuracy - control <#run-quantization-with-accuracy-control>`__ -- `Compare Accuracy and Performance of the Original and Quantized - Models <#compare-accuracy-and-performance-of-the-original-and-quantized-models>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. 
- -Prerequisites -^^^^^^^^^^^^^ - - - -Install necessary packages. - -.. code:: ipython3 - - %pip install -q "openvino>=2024.0.0" - %pip install -q "nncf>=2.9.0" - %pip install -q "ultralytics==8.3.59" tqdm --extra-index-url https://download.pytorch.org/whl/cpu - -Get Pytorch model and OpenVINO IR model ---------------------------------------- - - - -Generally, PyTorch models represent an instance of the -`torch.nn.Module `__ -class, initialized by a state dictionary with model weights. We will use -the YOLOv11 nano model (also known as ``yolo11n``) pre-trained on a COCO -dataset, which is available in this -`repo `__. Similar steps are -also applicable to other YOLOv11 models. Typical steps to obtain a -pre-trained model: - -1. Create an instance of a model class. -2. Load a checkpoint state dict, which contains the pre-trained model - weights. - -In this case, the creators of the model provide an API that enables -converting the YOLOv11 model to ONNX and then to OpenVINO IR. Therefore, -we do not need to do these steps manually. - -.. code:: ipython3 - - import os - from pathlib import Path - - from ultralytics import YOLO - from ultralytics.cfg import get_cfg - from ultralytics.data.utils import check_det_dataset - from ultralytics.engine.validator import BaseValidator as Validator - from ultralytics.utils import DEFAULT_CFG - from ultralytics.utils import ops - from ultralytics.utils.metrics import ConfusionMatrix - - ROOT = os.path.abspath("") - - MODEL_NAME = "yolo11n-seg" - - model = YOLO(f"{ROOT}/{MODEL_NAME}.pt") - args = get_cfg(cfg=DEFAULT_CFG) - args.data = "coco128-seg.yaml" - -.. code:: ipython3 - - # Fetch the notebook utils script from the openvino_notebooks repo - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("yolov11-quantization-with-accuracy-control.ipynb") - -.. code:: ipython3 - - from zipfile import ZipFile - - from ultralytics.data.utils import DATASETS_DIR - - DATA_URL = "https://www.ultralytics.com/assets/coco128-seg.zip" - CFG_URL = "https://raw.githubusercontent.com/ultralytics/ultralytics/v8.1.0/ultralytics/cfg/datasets/coco.yaml" - - OUT_DIR = DATASETS_DIR - - DATA_PATH = OUT_DIR / "coco128-seg.zip" - CFG_PATH = OUT_DIR / "coco128-seg.yaml" - - download_file(DATA_URL, DATA_PATH.name, DATA_PATH.parent) - download_file(CFG_URL, CFG_PATH.name, CFG_PATH.parent) - - if not (OUT_DIR / "coco128/labels").exists(): - with ZipFile(DATA_PATH, "r") as zip_ref: - zip_ref.extractall(OUT_DIR) - -Load model. - -.. code:: ipython3 - - import openvino as ov - - - model_path = Path(f"{ROOT}/{MODEL_NAME}_openvino_model/{MODEL_NAME}.xml") - if not model_path.exists(): - model.export(format="openvino", dynamic=True, half=False) - - ov_model = ov.Core().read_model(model_path) - - -.. 
parsed-literal:: - - Ultralytics 8.3.0 🚀 Python-3.10.12 torch-2.5.1+cpu CPU (Intel Core(TM) i9-10980XE 3.00GHz) - YOLO11n-seg summary (fused): 265 layers, 2,868,664 parameters, 0 gradients, 10.4 GFLOPs - - PyTorch: starting from '/home/maleksandr/test_notebooks/yolo/openvino_notebooks/notebooks/quantizing-model-with-accuracy-control/yolo11n-seg.pt' with input shape (1, 3, 640, 640) BCHW and output shape(s) ((1, 116, 8400), (1, 32, 160, 160)) (5.9 MB) - - OpenVINO: starting export with openvino 2024.4.0-16579-c3152d32c9c-releases/2024/4... - OpenVINO: export success ✅ 2.3s, saved as '/home/maleksandr/test_notebooks/yolo/openvino_notebooks/notebooks/quantizing-model-with-accuracy-control/yolo11n-seg_openvino_model/' (11.3 MB) - - Export complete (2.6s) - Results saved to /home/maleksandr/test_notebooks/yolo/openvino_notebooks/notebooks/quantizing-model-with-accuracy-control - Predict: yolo predict task=segment model=/home/maleksandr/test_notebooks/yolo/openvino_notebooks/notebooks/quantizing-model-with-accuracy-control/yolo11n-seg_openvino_model imgsz=640 - Validate: yolo val task=segment model=/home/maleksandr/test_notebooks/yolo/openvino_notebooks/notebooks/quantizing-model-with-accuracy-control/yolo11n-seg_openvino_model imgsz=640 data=/ultralytics/ultralytics/cfg/datasets/coco.yaml - Visualize: https://netron.app - - -Define validator and data loader -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -The original model repository uses a ``Validator`` wrapper, which -represents the accuracy validation pipeline. It creates dataloader and -evaluation metrics and updates metrics on each data batch produced by -the dataloader. Besides that, it is responsible for data preprocessing -and results postprocessing. For class initialization, the configuration -should be provided. We will use the default setup, but it can be -replaced with some parameters overriding to test on custom data. The -model has connected the ``ValidatorClass`` method, which creates a -validator class instance. - -.. code:: ipython3 - - from ultralytics.data.converter import coco80_to_coco91_class - - - validator = model.task_map[model.task]["validator"](args=args) - validator.data = check_det_dataset(args.data) - validator.stride = 3 - data_loader = validator.get_dataloader(OUT_DIR / "coco128-seg", 1) - - validator.is_coco = True - validator.class_map = coco80_to_coco91_class() - validator.names = model.model.names - validator.metrics.names = validator.names - validator.nc = model.model.model[-1].nc - validator.nm = 32 - validator.process = ops.process_mask - validator.plot_masks = [] - -Prepare calibration and validation datasets -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -We can use one dataset as calibration and validation datasets. Name it -``quantization_dataset``. - -.. code:: ipython3 - - from typing import Dict - - import nncf - - - def transform_fn(data_item: Dict): - input_tensor = validator.preprocess(data_item)["img"].numpy() - return input_tensor - - - quantization_dataset = nncf.Dataset(data_loader, transform_fn) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, openvino - - -Prepare validation function -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -.. 
code:: ipython3 - - from functools import partial - - import torch - from nncf.quantization.advanced_parameters import AdvancedAccuracyRestorerParameters - - - def validation_ac( - compiled_model: ov.CompiledModel, - validation_loader: torch.utils.data.DataLoader, - validator: Validator, - num_samples: int = None, - log=True, - ) -> float: - validator.seen = 0 - validator.jdict = [] - validator.stats = dict(tp_m=[], tp=[], conf=[], pred_cls=[], target_cls=[], target_img=[]) - validator.batch_i = 1 - validator.confusion_matrix = ConfusionMatrix(nc=validator.nc) - num_outputs = len(compiled_model.outputs) - - counter = 0 - for batch_i, batch in enumerate(validation_loader): - if num_samples is not None and batch_i == num_samples: - break - batch = validator.preprocess(batch) - results = compiled_model(batch["img"]) - if num_outputs == 1: - preds = torch.from_numpy(results[compiled_model.output(0)]) - else: - preds = [ - torch.from_numpy(results[compiled_model.output(0)]), - torch.from_numpy(results[compiled_model.output(1)]), - ] - preds = validator.postprocess(preds) - validator.update_metrics(preds, batch) - counter += 1 - stats = validator.get_stats() - if num_outputs == 1: - stats_metrics = stats["metrics/mAP50-95(B)"] - else: - stats_metrics = stats["metrics/mAP50-95(M)"] - if log: - print(f"Validate: dataset length = {counter}, metric value = {stats_metrics:.3f}") - - return stats_metrics - - - validation_fn = partial(validation_ac, validator=validator, log=False) - -Run quantization with accuracy control --------------------------------------- - - - -You should provide the calibration dataset and the validation dataset. -It can be the same dataset. - -- parameter ``max_drop`` defines the accuracy drop threshold. The quantization process stops when the - degradation of accuracy metric on the validation dataset is less than - the ``max_drop``. The default value is 0.01. NNCF will stop the - quantization and report an error if the ``max_drop`` value can’t be - reached. -- ``drop_type`` defines how the accuracy drop will be - calculated: ABSOLUTE (used by default) or RELATIVE. -- ``ranking_subset_size`` - size of a subset that is used to rank layers - by their contribution to the accuracy drop. Default value is 300, and - the more samples it has the better ranking, potentially. Here we use the - value 25 to speed up the execution. - - **NOTE**: Execution can take tens of minutes and requires up to 15 GB - of free memory - -.. code:: ipython3 - - quantized_model = nncf.quantize_with_accuracy_control( - ov_model, - quantization_dataset, - quantization_dataset, - validation_fn=validation_fn, - max_drop=0.01, - preset=nncf.QuantizationPreset.MIXED, - subset_size=128, - advanced_accuracy_restorer_parameters=AdvancedAccuracyRestorerParameters(ranking_subset_size=25), - ) - - - -.. parsed-literal:: - - Output() - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - -.. 
parsed-literal:: - - INFO:nncf:Validation of initial model was started - INFO:nncf:Elapsed Time: 00:00:00 - INFO:nncf:Elapsed Time: 00:00:04 - INFO:nncf:Metric of initial model: 0.39269183697877214 - INFO:nncf:Collecting values for each data item using the initial model - INFO:nncf:Elapsed Time: 00:00:04 - INFO:nncf:Validation of quantized model was started - INFO:nncf:Elapsed Time: 00:00:00 - INFO:nncf:Elapsed Time: 00:00:04 - INFO:nncf:Metric of quantized model: 0.37315412232099016 - INFO:nncf:Collecting values for each data item using the quantized model - INFO:nncf:Elapsed Time: 00:00:04 - INFO:nncf:Accuracy drop: 0.01953771465778198 (absolute) - INFO:nncf:Accuracy drop: 0.01953771465778198 (absolute) - INFO:nncf:Total number of quantized operations in the model: 127 - INFO:nncf:Number of parallel workers to rank quantized operations: 1 - INFO:nncf:ORIGINAL metric is used to rank quantizers - - - -.. parsed-literal:: - - Output() - - - - - - - - - -.. parsed-literal:: - - INFO:nncf:Elapsed Time: 00:02:11 - INFO:nncf:Changing the scope of quantizer nodes was started - INFO:nncf:Reverted 1 operations to the floating-point precision: - __module.model.23/aten::mul/Multiply_3 - INFO:nncf:Accuracy drop with the new quantization scope is 0.016468171203124993 (absolute) - INFO:nncf:Reverted 1 operations to the floating-point precision: - __module.model.0.conv/aten::_convolution/Convolution - INFO:nncf:Accuracy drop with the new quantization scope is 0.01663718104550027 (absolute) - INFO:nncf:Re-calculating ranking scores for remaining groups - - - -.. parsed-literal:: - - Output() - - - - - - - - - -.. parsed-literal:: - - INFO:nncf:Elapsed Time: 00:02:07 - INFO:nncf:Reverted 2 operations to the floating-point precision: - __module.model.23/aten::sub/Subtract_1 - __module.model.23/aten::add/Add_7 - INFO:nncf:Algorithm completed: achieved required accuracy drop 0.006520879829061188 (absolute) - INFO:nncf:3 out of 127 were reverted back to the floating-point precision: - __module.model.23/aten::mul/Multiply_3 - __module.model.23/aten::sub/Subtract_1 - __module.model.23/aten::add/Add_7 - - -Compare Accuracy and Performance of the Original and Quantized Models ---------------------------------------------------------------------- - - - -Now we can compare metrics of the Original non-quantized OpenVINO IR -model and Quantized OpenVINO IR model to make sure that the ``max_drop`` -is not exceeded. - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - core = ov.Core() - ov_config = {} - if device.value != "CPU": - quantized_model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device.value or ("AUTO" in device.value and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - quantized_compiled_model = core.compile_model(quantized_model, device.value, ov_config) - compiled_ov_model = core.compile_model(ov_model, device.value, ov_config) - - pt_result = validation_ac(compiled_ov_model, data_loader, validator) - quantized_result = validation_ac(quantized_compiled_model, data_loader, validator) - - - print(f"[Original OpenVINO]: {pt_result:.4f}") - print(f"[Quantized OpenVINO]: {quantized_result:.4f}") - - -.. 
parsed-literal:: - - Validate: dataset length = 128, metric value = 0.393 - Validate: dataset length = 128, metric value = 0.386 - [Original OpenVINO]: 0.3927 - [Quantized OpenVINO]: 0.3862 - - -And compare performance. - -.. code:: ipython3 - - from pathlib import Path - - # Set model directory - MODEL_DIR = Path("model") - MODEL_DIR.mkdir(exist_ok=True) - - ir_model_path = MODEL_DIR / "ir_model.xml" - quantized_model_path = MODEL_DIR / "quantized_model.xml" - - # Save models to use them in the commandline banchmark app - ov.save_model(ov_model, ir_model_path, compress_to_fp16=False) - ov.save_model(quantized_model, quantized_model_path, compress_to_fp16=False) - -.. code:: ipython3 - - # Inference Original model (OpenVINO IR) - ! benchmark_app -m $ir_model_path -shape "[1,3,640,640]" -d $device.value -api async - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ WARNING ] Default duration 120 seconds is used for unknown device AUTO - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 15.23 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [?,3,?,?] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_8) : f32 / [...] / [?,116,21..] - [ INFO ] input.255 (node: __module.model.23.cv4.2.1.act/aten::silu_/Swish_46) : f32 / [...] / [?,32,8..,8..] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 7.93 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_8) : f32 / [...] / [1,116,8400] - [ INFO ] input.255 (node: __module.model.23.cv4.2.1.act/aten::silu_/Swish_46) : f32 / [...] 
/ [1,32,160,160] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 389.72 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 120000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 45.30 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 17580 iterations - [ INFO ] Duration: 120085.24 ms - [ INFO ] Latency: - [ INFO ] Median: 80.74 ms - [ INFO ] Average: 81.80 ms - [ INFO ] Min: 59.61 ms - [ INFO ] Max: 151.88 ms - [ INFO ] Throughput: 146.40 FPS - - -.. code:: ipython3 - - # Inference Quantized model (OpenVINO IR) - ! benchmark_app -m $quantized_model_path -shape "[1,3,640,640]" -d $device.value -api async - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ WARNING ] Default duration 120 seconds is used for unknown device AUTO - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.4.0-16579-c3152d32c9c-releases/2024/4 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 25.39 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_8) : f32 / [...] / [1,116,8400] - [ INFO ] input.255 (node: __module.model.23.cv4.2.1.act/aten::silu_/Swish_46) : f32 / [...] 
/ [1,32,160,160] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 0.05 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.23/aten::cat/Concat_8) : f32 / [...] / [1,116,8400] - [ INFO ] input.255 (node: __module.model.23.cv4.2.1.act/aten::silu_/Swish_46) : f32 / [...] / [1,32,160,160] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 639.12 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 120000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 33.20 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 38424 iterations - [ INFO ] Duration: 120039.54 ms - [ INFO ] Latency: - [ INFO ] Median: 37.07 ms - [ INFO ] Average: 37.32 ms - [ INFO ] Min: 22.57 ms - [ INFO ] Max: 72.28 ms - [ INFO ] Throughput: 320.09 FPS - diff --git a/docs/notebooks/yolov8-instance-segmentation-with-output.rst b/docs/notebooks/yolov8-instance-segmentation-with-output.rst deleted file mode 100644 index f8b937b9b59d0b..00000000000000 --- a/docs/notebooks/yolov8-instance-segmentation-with-output.rst +++ /dev/null @@ -1,1364 +0,0 @@ -Convert and Optimize YOLOv8 instance segmentation model with OpenVINO™ -====================================================================== - -Instance segmentation goes a step further than object detection and -involves identifying individual objects in an image and segmenting them -from the rest of the image. Instance segmentation as an object detection -are often used as key components in computer vision systems. 
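-
-Both kinds of results can be obtained through the same ``ultralytics`` API
-used throughout this tutorial. As a minimal sketch (for illustration only;
-``sample.jpg`` is a placeholder path and the required packages are installed
-in the Prerequisites step below), a segmentation result carries per-instance
-masks in addition to the usual bounding boxes:
-
-.. code:: ipython3
-
-    from ultralytics import YOLO
-
-    model = YOLO("yolov8n-seg.pt")  # pre-trained YOLOv8 nano segmentation checkpoint
-    results = model("sample.jpg")  # placeholder image path
-
-    boxes = results[0].boxes  # per-instance bounding boxes, as in plain object detection
-    masks = results[0].masks  # per-instance segmentation masks (None if nothing is detected)
-    print(len(boxes), "instances detected")
-    if masks is not None:
-        print("mask tensor shape:", tuple(masks.data.shape))
-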
-Applications that use real-time instance segmentation models include -video analytics, robotics, autonomous vehicles, multi-object tracking -and object counting, medical image analysis, and many others. - -This tutorial demonstrates step-by-step instructions on how to run and -optimize PyTorch YOLOv8 with OpenVINO. We consider the steps required -for instance segmentation scenario. - -The tutorial consists of the following steps: - Prepare the PyTorch -model. - Download and prepare a dataset. - Validate the original model. -- Convert the PyTorch model to OpenVINO IR. - Validate the converted -model. - Prepare and run optimization pipeline. - Compare performance of -the FP32 and quantized models. - Compare accuracy of the FP32 and -quantized models. - Live demo - - -**Table of contents:** - - -- `Get PyTorch model <#get-pytorch-model>`__ - - - `Prerequisites <#prerequisites>`__ - -- `Instantiate model <#instantiate-model>`__ - - - `Convert model to OpenVINO IR <#convert-model-to-openvino-ir>`__ - - `Verify model inference <#verify-model-inference>`__ - - `Select inference device <#select-inference-device>`__ - - `Test on single image <#test-on-single-image>`__ - -- `Check model accuracy on the - dataset <#check-model-accuracy-on-the-dataset>`__ - - - `Download the validation - dataset <#download-the-validation-dataset>`__ - - `Define validation function <#define-validation-function>`__ - - `Configure Validator helper and create - DataLoader <#configure-validator-helper-and-create-dataloader>`__ - -- `Optimize model using NNCF Post-training Quantization - API <#optimize-model-using-nncf-post-training-quantization-api>`__ - - - `Validate Quantized model - inference <#validate-quantized-model-inference>`__ - -- `Compare the Original and Quantized - Models <#compare-the-original-and-quantized-models>`__ - - - `Compare performance of the Original and Quantized - Models <#compare-performance-of-the-original-and-quantized-models>`__ - - `Validate quantized model - accuracy <#validate-quantized-model-accuracy>`__ - -- `Other ways to optimize model <#other-ways-to-optimize-model>`__ -- `Live demo <#live-demo>`__ - - - `Run Live Object Detection and - Segmentation <#run-live-object-detection-and-segmentation>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Get PyTorch model ------------------ - - - -Generally, PyTorch models represent an instance of the -`torch.nn.Module `__ -class, initialized by a state dictionary with model weights. We will use -the YOLOv8 nano model (also known as ``yolov8n``) pre-trained on a COCO -dataset, which is available in this -`repo `__. Similar steps are -also applicable to other YOLOv8 models. Typical steps to obtain a -pre-trained model: 1. Create an instance of a model class. 2. Load a -checkpoint state dict, which contains the pre-trained model weights. 3. -Turn the model to evaluation for switching some operations to inference -mode. - -In this case, the creators of the model provide an API that enables -converting the YOLOv8 model to ONNX and then to OpenVINO IR. Therefore, -we do not need to do these steps manually. - -Prerequisites -^^^^^^^^^^^^^ - - - -Install necessary packages. - -.. 
code:: ipython3 - - %pip install -q "openvino>=2024.0.0" "nncf>=2.9.0" - %pip install -q "torch>=2.1" "torchvision>=0.16" "ultralytics==8.3.59" opencv-python tqdm --extra-index-url https://download.pytorch.org/whl/cpu - -Import required utility functions. The lower cell will download the -``notebook_utils`` Python module from GitHub. - -.. code:: ipython3 - - from pathlib import Path - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, VideoPlayer, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("yolov8-instance-segmentation.ipynb") - -.. code:: ipython3 - - # Download a test sample - IMAGE_PATH = Path("./data/coco_bike.jpg") - if not IMAGE_PATH.exists(): - download_file( - url="https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bike.jpg", - filename=IMAGE_PATH.name, - directory=IMAGE_PATH.parent, - ) - - - -.. parsed-literal:: - - data/coco_bike.jpg: 0%| | 0.00/182k [00:00\ **Note**: Model evaluation is time consuming -process and can take several minutes, depending on the hardware. For -reducing calculation time, we define ``num_samples`` parameter with -evaluation subset size, but in this case, accuracy can be noncomparable -with originally reported by the authors of the model, due to validation -subset difference. *To validate the models on the full dataset set -``NUM_TEST_SAMPLES = None``.* - -.. code:: ipython3 - - NUM_TEST_SAMPLES = 300 - -.. code:: ipython3 - - fp_seg_stats = test(seg_ov_model, core, seg_data_loader, seg_validator, num_samples=NUM_TEST_SAMPLES) - - - -.. parsed-literal:: - - 0%| | 0/300 [00:00`__ provides a suite of -advanced algorithms for Neural Networks inference optimization in -OpenVINO with minimal accuracy drop. We will use 8-bit quantization in -post-training mode (without the fine-tuning pipeline) to optimize -YOLOv8. - -The optimization process contains the following steps: - -1. Create a Dataset for quantization. -2. Run ``nncf.quantize`` for getting an optimized model. -3. Serialize OpenVINO IR model, using the ``openvino.runtime.serialize`` - function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - import ipywidgets as widgets - - int8_model_seg_path = models_dir / f"{SEG_MODEL_NAME}_openvino_int8_model/{SEG_MODEL_NAME}.xml" - - to_quantize = widgets.Checkbox( - value=True, - description="Quantization", - disabled=False, - ) - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. code:: ipython3 - - # Fetch skip_kernel_extension module - import requests - - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -Reuse validation dataloader in accuracy testing for quantization. 
For -that, it should be wrapped into the ``nncf.Dataset`` object and define a -transformation function for getting only input tensors. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - - import nncf - from typing import Dict - - - def transform_fn(data_item:Dict): - """ - Quantization transform function. Extracts and preprocess input data from dataloader item for quantization. - Parameters: - data_item: Dict with data item produced by DataLoader during iteration - Returns: - input_tensor: Input data for quantization - """ - input_tensor = seg_validator.preprocess(data_item)['img'].numpy() - return input_tensor - - - quantization_dataset = nncf.Dataset(seg_data_loader, transform_fn) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino - - -The ``nncf.quantize`` function provides an interface for model -quantization. It requires an instance of the OpenVINO Model and -quantization dataset. Optionally, some additional parameters for the -configuration quantization process (number of samples for quantization, -preset, ignored scope, etc.) can be provided. YOLOv8 model contains -non-ReLU activation functions, which require asymmetric quantization of -activations. To achieve a better result, we will use a ``mixed`` -quantization preset. It provides symmetric quantization of weights and -asymmetric quantization of activations. For more accurate results, we -should keep the operation in the postprocessing subgraph in floating -point precision, using the ``ignored_scope`` parameter. - - **Note**: Model post-training quantization is time-consuming process. - Be patient, it can take several minutes depending on your hardware. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - ignored_scope = nncf.IgnoredScope( # post-processing - subgraphs=[ - nncf.Subgraph(inputs=['__module.model.22/aten::cat/Concat', - '__module.model.22/aten::cat/Concat_1', - '__module.model.22/aten::cat/Concat_2', - '__module.model.22/aten::cat/Concat_7'], - outputs=['__module.model.22/aten::cat/Concat_8']) - ] - ) - - # Segmentation model - quantized_seg_model = nncf.quantize( - seg_ov_model, - quantization_dataset, - preset=nncf.QuantizationPreset.MIXED, - ignored_scope=ignored_scope - ) - - -.. 
parsed-literal:: - - INFO:nncf:106 ignored nodes were found by subgraphs in the NNCFGraph - INFO:nncf:Not adding activation input quantizer for operation: 142 __module.model.22/aten::cat/Concat - INFO:nncf:Not adding activation input quantizer for operation: 151 __module.model.22/aten::view/Reshape_3 - INFO:nncf:Not adding activation input quantizer for operation: 270 __module.model.22/aten::cat/Concat_1 - INFO:nncf:Not adding activation input quantizer for operation: 281 __module.model.22/aten::view/Reshape_4 - INFO:nncf:Not adding activation input quantizer for operation: 336 __module.model.22/aten::cat/Concat_2 - INFO:nncf:Not adding activation input quantizer for operation: 339 __module.model.22/aten::view/Reshape_5 - INFO:nncf:Not adding activation input quantizer for operation: 152 __module.model.22/aten::cat/Concat_7 - INFO:nncf:Not adding activation input quantizer for operation: 163 __module.model.22/aten::cat/Concat_4 - INFO:nncf:Not adding activation input quantizer for operation: 176 __module.model.22/prim::ListUnpack - INFO:nncf:Not adding activation input quantizer for operation: 191 __module.model.22.dfl/aten::view/Reshape - INFO:nncf:Not adding activation input quantizer for operation: 192 __module.model.22/aten::sigmoid/Sigmoid - INFO:nncf:Not adding activation input quantizer for operation: 206 __module.model.22.dfl/aten::transpose/Transpose - INFO:nncf:Not adding activation input quantizer for operation: 217 __module.model.22.dfl/aten::softmax/Softmax - INFO:nncf:Not adding activation input quantizer for operation: 227 __module.model.22.dfl.conv/aten::_convolution/Convolution - INFO:nncf:Not adding activation input quantizer for operation: 235 __module.model.22.dfl/aten::view/Reshape_1 - INFO:nncf:Not adding activation input quantizer for operation: 245 __module.model.22/prim::ListUnpack/VariadicSplit - INFO:nncf:Not adding activation input quantizer for operation: 254 __module.model.22/aten::sub/Subtract - INFO:nncf:Not adding activation input quantizer for operation: 255 __module.model.22/aten::add/Add_6 - INFO:nncf:Not adding activation input quantizer for operation: 265 __module.model.22/aten::add/Add_7 - 275 __module.model.22/aten::div/Divide - - INFO:nncf:Not adding activation input quantizer for operation: 266 __module.model.22/aten::sub/Subtract_1 - INFO:nncf:Not adding activation input quantizer for operation: 276 __module.model.22/aten::cat/Concat_5 - INFO:nncf:Not adding activation input quantizer for operation: 242 __module.model.22/aten::mul/Multiply_3 - INFO:nncf:Not adding activation input quantizer for operation: 164 __module.model.22/aten::cat/Concat_8 - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - print(f"Quantized segmentation model will be saved to {int8_model_seg_path}") - ov.save_model(quantized_seg_model, str(int8_model_seg_path)) - - -.. parsed-literal:: - - Quantized segmentation model will be saved to models/yolov8n-seg_openvino_int8_model/yolov8n-seg.xml - - -Validate Quantized model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -``nncf.quantize`` returns the OpenVINO Model class instance, which is -suitable for loading on a device for making predictions. ``INT8`` model -input data and output result formats have no difference from the -floating point model representation. 
Therefore, we can reuse the same -``detect`` function defined above for getting the ``INT8`` model result -on the image. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - device - -.. code:: ipython3 - - %%skip not $to_quantize.value - - ov_config = {} - if device.value != "CPU": - quantized_seg_model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device.value or ("AUTO" in device.value and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - - quantized_seg_compiled_model = core.compile_model(quantized_seg_model, device.value, ov_config) - -.. code:: ipython3 - - %%skip not $to_quantize.value - - - def infer(*args): - result = quantized_seg_compiled_model(args) - return torch.from_numpy(result[0]), torch.from_numpy(result[1]) - - seg_model.predictor.inference = infer - -.. code:: ipython3 - - %%skip not $to_quantize.value - - res = seg_model(IMAGE_PATH) - display(Image.fromarray(res[0].plot()[:, :, ::-1])) - - -.. parsed-literal:: - - - image 1/1 /home/akash/intel/openvino_notebooks/notebooks/yolov8-optimization/data/coco_bike.jpg: 640x640 1 bicycle, 2 cars, 1 dog, 20.0ms - Speed: 4.5ms preprocess, 20.0ms inference, 16.9ms postprocess per image at shape (1, 3, 640, 640) - - - -.. image:: yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_46_1.png - - -Compare the Original and Quantized Models ------------------------------------------ - - - -Compare performance of the Original and Quantized Models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Finally, use the OpenVINO -`Benchmark -Tool `__ -to measure the inference performance of the ``FP32`` and ``INT8`` -models. - - **Note**: For more accurate performance, it is recommended to run - ``benchmark_app`` in a terminal/command prompt after closing other - applications. Run - ``benchmark_app -m -d CPU -shape ""`` to - benchmark async inference on CPU on specific input data shape for one - minute. Change ``CPU`` to ``GPU`` to benchmark on GPU. Run - ``benchmark_app --help`` to see an overview of all command-line - options. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - device - -.. code:: ipython3 - - if int8_model_seg_path.exists(): - !benchmark_app -m $seg_model_path -d $device.value -api async -shape "[1,3,640,640]" -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.3.0-16041-1e3b88e4e3f-releases/2024/3 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.3.0-16041-1e3b88e4e3f-releases/2024/3 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 14.75 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [?,3,?,?] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.22/aten::cat/Concat_8) : f32 / [...] / [?,116,21..] - [ INFO ] input.199 (node: __module.model.22.cv4.2.1.act/aten::silu_/Swish_37) : f32 / [...] / [?,32,8..,8..] 
- [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 7.87 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.22/aten::cat/Concat_8) : f32 / [...] / [1,116,8400] - [ INFO ] input.199 (node: __module.model.22.cv4.2.1.act/aten::silu_/Swish_37) : f32 / [...] / [1,32,160,160] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 428.56 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 47.21 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 1704 iterations - [ INFO ] Duration: 15155.38 ms - [ INFO ] Latency: - [ INFO ] Median: 106.55 ms - [ INFO ] Average: 106.35 ms - [ INFO ] Min: 62.73 ms - [ INFO ] Max: 156.70 ms - [ INFO ] Throughput: 112.44 FPS - - -.. code:: ipython3 - - if int8_model_seg_path.exists(): - !benchmark_app -m $int8_model_seg_path -d $device.value -api async -shape "[1,3,640,640]" -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.3.0-16041-1e3b88e4e3f-releases/2024/3 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.3.0-16041-1e3b88e4e3f-releases/2024/3 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. 
- [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 44.84 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,?,?] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.22/aten::cat/Concat_8) : f32 / [...] / [1,116,21..] - [ INFO ] input.199 (node: __module.model.22.cv4.2.1.act/aten::silu_/Swish_37) : f32 / [...] / [1,32,8..,8..] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 15.82 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.22/aten::cat/Concat_8) : f32 / [...] / [1,116,8400] - [ INFO ] input.199 (node: __module.model.22.cv4.2.1.act/aten::silu_/Swish_37) : f32 / [...] / [1,32,160,160] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 622.28 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 31.05 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 4056 iterations - [ INFO ] Duration: 15056.34 ms - [ INFO ] Latency: - [ INFO ] Median: 44.52 ms - [ INFO ] Average: 44.37 ms - [ INFO ] Min: 29.32 ms - [ INFO ] Max: 64.98 ms - [ INFO ] Throughput: 269.39 FPS - - -Validate quantized model accuracy -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -As we can see, there is no significant difference between ``INT8`` and -float model result in a single image test. To understand how -quantization influences model prediction precision, we can compare model -accuracy on a dataset. - -.. 
code:: ipython3 - - %%skip not $to_quantize.value - - int8_seg_stats = test(quantized_seg_model, core, seg_data_loader, seg_validator, num_samples=NUM_TEST_SAMPLES) - - - -.. parsed-literal:: - - 0%| | 0/300 [00:00`__ - -Preprocessing API enables making preprocessing a part of the model -reducing application code and dependency on additional image processing -libraries. The main advantage of Preprocessing API is that preprocessing -steps will be integrated into the execution graph and will be performed -on a selected device (CPU/GPU etc.) rather than always being executed on -CPU as part of an application. This will also improve selected device -utilization. For more information, refer to the overview of -`Preprocessing API -tutorial `__. To -see, how it could be used with YOLOV8 object detection model , please, -see `Convert and Optimize YOLOv8 real-time object detection with -OpenVINO tutorial `__ - -Live demo ---------- - - - -The following code runs model inference on a video: - -.. code:: ipython3 - - import collections - import time - import cv2 - from IPython import display - - - def run_instance_segmentation( - source=0, - flip=False, - use_popup=False, - skip_first_frames=0, - model=seg_model, - device=device.value, - ): - player = None - - ov_config = {} - if device != "CPU": - model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device or ("AUTO" in device and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - compiled_model = core.compile_model(model, device, ov_config) - - def infer(*args): - result = compiled_model(args) - return torch.from_numpy(result[0]), torch.from_numpy(result[1]) - - seg_model.predictor.inference = infer - - try: - # Create a video player to play with target fps. - player = VideoPlayer(source=source, flip=flip, fps=30, skip_first_frames=skip_first_frames) - # Start capturing. - player.start() - if use_popup: - title = "Press ESC to Exit" - cv2.namedWindow(winname=title, flags=cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE) - - processing_times = collections.deque() - while True: - # Grab the frame. - frame = player.next() - if frame is None: - print("Source ended") - break - # If the frame is larger than full HD, reduce size to improve the performance. - scale = 1280 / max(frame.shape) - if scale < 1: - frame = cv2.resize( - src=frame, - dsize=None, - fx=scale, - fy=scale, - interpolation=cv2.INTER_AREA, - ) - # Get the results. - input_image = np.array(frame) - - start_time = time.time() - detections = seg_model(input_image) - stop_time = time.time() - frame = detections[0].plot() - - processing_times.append(stop_time - start_time) - # Use processing times from last 200 frames. - if len(processing_times) > 200: - processing_times.popleft() - - _, f_width = frame.shape[:2] - # Mean processing time [ms]. - processing_time = np.mean(processing_times) * 1000 - fps = 1000 / processing_time - cv2.putText( - img=frame, - text=f"Inference time: {processing_time:.1f}ms ({fps:.1f} FPS)", - org=(20, 40), - fontFace=cv2.FONT_HERSHEY_COMPLEX, - fontScale=f_width / 1000, - color=(0, 0, 255), - thickness=1, - lineType=cv2.LINE_AA, - ) - # Use this workaround if there is flickering. - if use_popup: - cv2.imshow(winname=title, mat=frame) - key = cv2.waitKey(1) - # escape = 27 - if key == 27: - break - else: - # Encode numpy array to jpg. - _, encoded_img = cv2.imencode(ext=".jpg", img=frame, params=[cv2.IMWRITE_JPEG_QUALITY, 100]) - # Create an IPython image. 
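-                    # display.Image wraps the encoded JPEG bytes; together with the
-                    # clear_output()/display() calls below, only the latest frame stays
-                    # visible, which emulates video playback inside the notebook.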
- i = display.Image(data=encoded_img) - # Display the image in this notebook. - display.clear_output(wait=True) - display.display(i) - # ctrl-c - except KeyboardInterrupt: - print("Interrupted") - # any different error - except RuntimeError as e: - print(e) - finally: - if player is not None: - # Stop capturing. - player.stop() - if use_popup: - cv2.destroyAllWindows() - -Run Live Object Detection and Segmentation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Use a webcam as the video input. By default, the primary webcam is set -with \ ``source=0``. If you have multiple webcams, each one will be -assigned a consecutive number starting at 0. Set \ ``flip=True`` when -using a front-facing camera. Some web browsers, especially Mozilla -Firefox, may cause flickering. If you experience flickering, -set \ ``use_popup=True``. - - **NOTE**: To use this notebook with a webcam, you need to run the - notebook on a computer with a webcam. If you run the notebook on a - remote server (for example, in Binder or Google Colab service), the - webcam will not work. By default, the lower cell will run model - inference on a video file. If you want to try live inference on your - webcam set ``WEBCAM_INFERENCE = True`` - -.. code:: ipython3 - - WEBCAM_INFERENCE = False - - if WEBCAM_INFERENCE: - VIDEO_SOURCE = 0 # Webcam - else: - VIDEO_URL = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/people.mp4" - VIDEO_SOURCE = "people.mp4" - if not Path(VIDEO_SOURCE).exists(): - download_file(VIDEO_URL) - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - run_instance_segmentation( - source=VIDEO_SOURCE, - flip=True, - use_popup=False, - model=seg_ov_model, - device=device.value, - ) - - - -.. image:: yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_62_0.png - - -.. 
parsed-literal:: - - Source ended - diff --git a/docs/notebooks/yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_18_1.png b/docs/notebooks/yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_18_1.png deleted file mode 100644 index 47cd8e5b2bd463..00000000000000 --- a/docs/notebooks/yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_18_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e8d5301d9b17bda455dc4750afbd834c811f59292e8cc62ebde3f0eb051bfd3c -size 733019 diff --git a/docs/notebooks/yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_46_1.png b/docs/notebooks/yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_46_1.png deleted file mode 100644 index f4de86fdfe33a1..00000000000000 --- a/docs/notebooks/yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_46_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9501d2d8d10dd1282127a4d8c79c862db51565c2fc1956388618d77f118b473a -size 732622 diff --git a/docs/notebooks/yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_62_0.png b/docs/notebooks/yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_62_0.png deleted file mode 100644 index 24465088c13adb..00000000000000 --- a/docs/notebooks/yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_62_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:05c152a0b21e9614dd8c88700d52154fb7eb6f0c2a9ab657f551bfc6dafa35b4 -size 473426 diff --git a/docs/notebooks/yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_9_3.png b/docs/notebooks/yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_9_3.png deleted file mode 100644 index e71abddeb8df6f..00000000000000 --- a/docs/notebooks/yolov8-instance-segmentation-with-output_files/yolov8-instance-segmentation-with-output_9_3.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3389035405c43a594d8915b601b27bb628ebe5de072447ddd8949888a92363a4 -size 732515 diff --git a/docs/notebooks/yolov8-keypoint-detection-with-output.rst b/docs/notebooks/yolov8-keypoint-detection-with-output.rst deleted file mode 100644 index 91c63a36ecbd29..00000000000000 --- a/docs/notebooks/yolov8-keypoint-detection-with-output.rst +++ /dev/null @@ -1,1318 +0,0 @@ -Convert and Optimize YOLOv8 keypoint detection model with OpenVINO™ -=================================================================== - -Keypoint detection/Pose is a task that involves detecting specific -points in an image or video frame. These points are referred to as -keypoints and are used to track movement or pose estimation. YOLOv8 can -detect keypoints in an image or video frame with high accuracy and -speed. - -This tutorial demonstrates step-by-step instructions on how to run and -optimize `PyTorch YOLOv8 Pose -model `__ with OpenVINO. We -consider the steps required for keypoint detection scenario. - -The tutorial consists of the following steps: - Prepare the PyTorch -model. - Download and prepare a dataset. - Validate the original model. -- Convert the PyTorch model to OpenVINO IR. - Validate the converted -model. - Prepare and run optimization pipeline. 
- Compare performance of -the FP32 and quantized models. - Compare accuracy of the FP32 and -quantized models. - Live demo - - -**Table of contents:** - - -- `Get PyTorch model <#get-pytorch-model>`__ - - - `Prerequisites <#prerequisites>`__ - -- `Instantiate model <#instantiate-model>`__ - - - `Convert model to OpenVINO IR <#convert-model-to-openvino-ir>`__ - - `Verify model inference <#verify-model-inference>`__ - - `Select inference device <#select-inference-device>`__ - - `Test on single image <#test-on-single-image>`__ - -- `Check model accuracy on the - dataset <#check-model-accuracy-on-the-dataset>`__ - - - `Download the validation - dataset <#download-the-validation-dataset>`__ - - `Define validation function <#define-validation-function>`__ - - `Configure Validator helper and create - DataLoader <#configure-validator-helper-and-create-dataloader>`__ - -- `Optimize model using NNCF Post-training Quantization - API <#optimize-model-using-nncf-post-training-quantization-api>`__ - - - `Validate Quantized model - inference <#validate-quantized-model-inference>`__ - -- `Compare the Original and Quantized - Models <#compare-the-original-and-quantized-models>`__ - - - `Compare performance of the Original and Quantized - Models <#compare-performance-of-the-original-and-quantized-models>`__ - - `Compare accuracy of the Original and Quantized - Models <#compare-accuracy-of-the-original-and-quantized-models>`__ - -- `Other ways to optimize model <#other-ways-to-optimize-model>`__ -- `Live demo <#live-demo>`__ - - - `Run Keypoint Detection on - video <#run-keypoint-detection-on-video>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Get PyTorch model ------------------ - - - -Generally, PyTorch models represent an instance of the -`torch.nn.Module `__ -class, initialized by a state dictionary with model weights. We will use -the YOLOv8 nano model (also known as ``yolov8n``) pre-trained on a COCO -dataset, which is available in this -`repo `__. Similar steps are -also applicable to other YOLOv8 models. Typical steps to obtain a -pre-trained model: 1. Create an instance of a model class. 2. Load a -checkpoint state dict, which contains the pre-trained model weights. 3. -Turn the model to evaluation for switching some operations to inference -mode. - -In this case, the creators of the model provide an API that enables -converting the YOLOv8 model to OpenVINO IR. Therefore, we do not need to -do these steps manually. - -Prerequisites -^^^^^^^^^^^^^ - - - -Install necessary packages. - -.. code:: ipython3 - - %pip install -q "openvino>=2024.0.0" "nncf>=2.9.0" - %pip install -q "protobuf==3.20.*" "torch>=2.1" "torchvision>=0.16" "ultralytics==8.3.59" tqdm opencv-python --extra-index-url https://download.pytorch.org/whl/cpu - -Import required utility functions. The lower cell will download the -``notebook_utils`` Python module from GitHub. - -.. 
code:: ipython3 - - from pathlib import Path - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - from notebook_utils import download_file, VideoPlayer, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("yolov8-keypoint-detection.ipynb") - -.. code:: ipython3 - - # Download a test sample - IMAGE_PATH = Path("./data/intel_rnb.jpg") - if not IMAGE_PATH.exists(): - download_file( - url="https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/intel_rnb.jpg", - filename=IMAGE_PATH.name, - directory=IMAGE_PATH.parent, - ) - - - -.. parsed-literal:: - - data/intel_rnb.jpg: 0%| | 0.00/288k [00:00\ **Note**: Model evaluation is time consuming -process and can take several minutes, depending on the hardware. For -reducing calculation time, we define ``num_samples`` parameter with -evaluation subset size, but in this case, accuracy can be noncomparable -with originally reported by the authors of the model, due to validation -subset difference. *To validate the models on the full dataset set -``NUM_TEST_SAMPLES = None``.* - -.. code:: ipython3 - - NUM_TEST_SAMPLES = 300 - -.. code:: ipython3 - - fp_pose_stats = test(pose_ov_model, core, pose_data_loader, pose_validator, num_samples=NUM_TEST_SAMPLES) - - - -.. parsed-literal:: - - 0%| | 0/300 [00:00`__ provides a suite of -advanced algorithms for Neural Networks inference optimization in -OpenVINO with minimal accuracy drop. We will use 8-bit quantization in -post-training mode (without the fine-tuning pipeline) to optimize -YOLOv8. - -The optimization process contains the following steps: - -1. Create a Dataset for quantization. -2. Run ``nncf.quantize`` for getting an optimized model. -3. Serialize OpenVINO IR model, using the ``openvino.runtime.serialize`` - function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - import ipywidgets as widgets - - int8_model_pose_path = models_dir / f"{POSE_MODEL_NAME}_openvino_int8_model/{POSE_MODEL_NAME}.xml" - - to_quantize = widgets.Checkbox( - value=True, - description="Quantization", - disabled=False, - ) - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. code:: ipython3 - - # Fetch skip_kernel_extension module - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -Reuse validation dataloader in accuracy testing for quantization. For -that, it should be wrapped into the ``nncf.Dataset`` object and define a -transformation function for getting only input tensors. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - from typing import Dict - - - def transform_fn(data_item:Dict): - """ - Quantization transform function. Extracts and preprocess input data from dataloader item for quantization. 
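-        Only the preprocessed image tensor is returned, since NNCF calibration
-        only needs model inputs (labels are not used at this stage).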
- Parameters: - data_item: Dict with data item produced by DataLoader during iteration - Returns: - input_tensor: Input data for quantization - """ - input_tensor = pose_validator.preprocess(data_item)['img'].numpy() - return input_tensor - - - quantization_dataset = nncf.Dataset(pose_data_loader, transform_fn) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino - - -The ``nncf.quantize`` function provides an interface for model -quantization. It requires an instance of the OpenVINO Model and -quantization dataset. Optionally, some additional parameters for the -configuration quantization process (number of samples for quantization, -preset, ignored scope, etc.) can be provided. YOLOv8 model contains -non-ReLU activation functions, which require asymmetric quantization of -activations. To achieve a better result, we will use a ``mixed`` -quantization preset. It provides symmetric quantization of weights and -asymmetric quantization of activations. For more accurate results, we -should keep the operation in the postprocessing subgraph in floating -point precision, using the ``ignored_scope`` parameter. - - **Note**: Model post-training quantization is time-consuming process. - Be patient, it can take several minutes depending on your hardware. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - ignored_scope = nncf.IgnoredScope( # post-processing - subgraphs=[ - nncf.Subgraph(inputs=['__module.model.22/aten::cat/Concat', - '__module.model.22/aten::cat/Concat_1', - '__module.model.22/aten::cat/Concat_2', - '__module.model.22/aten::cat/Concat_7'], - outputs=['__module.model.22/aten::cat/Concat_9']) - ] - ) - - # Detection model - quantized_pose_model = nncf.quantize( - pose_ov_model, - quantization_dataset, - preset=nncf.QuantizationPreset.MIXED, - ignored_scope=ignored_scope - ) - - -.. 
parsed-literal:: - - INFO:nncf:116 ignored nodes were found by subgraphs in the NNCFGraph - INFO:nncf:Not adding activation input quantizer for operation: 134 __module.model.22/aten::cat/Concat - INFO:nncf:Not adding activation input quantizer for operation: 142 __module.model.22/aten::view/Reshape_3 - INFO:nncf:Not adding activation input quantizer for operation: 267 __module.model.22/aten::cat/Concat_1 - INFO:nncf:Not adding activation input quantizer for operation: 279 __module.model.22/aten::view/Reshape_4 - INFO:nncf:Not adding activation input quantizer for operation: 334 __module.model.22/aten::cat/Concat_2 - INFO:nncf:Not adding activation input quantizer for operation: 337 __module.model.22/aten::view/Reshape_5 - INFO:nncf:Not adding activation input quantizer for operation: 143 __module.model.22/aten::cat/Concat_7 - INFO:nncf:Not adding activation input quantizer for operation: 154 __module.model.22/aten::view/Reshape_9 - INFO:nncf:Not adding activation input quantizer for operation: 166 __module.model.22/aten::slice/Slice_2 - INFO:nncf:Not adding activation input quantizer for operation: 167 __module.model.22/aten::slice/Slice_5 - INFO:nncf:Not adding activation input quantizer for operation: 182 __module.model.22/aten::mul/Multiply_4 - 199 __module.model.22/aten::add/Add_8 - - INFO:nncf:Not adding activation input quantizer for operation: 183 __module.model.22/aten::sigmoid/Sigmoid_1 - INFO:nncf:Not adding activation input quantizer for operation: 213 __module.model.22/aten::mul/Multiply_5 - INFO:nncf:Not adding activation input quantizer for operation: 200 __module.model.22/aten::cat/Concat_8 - INFO:nncf:Not adding activation input quantizer for operation: 214 __module.model.22/aten::view/Reshape_10 - INFO:nncf:Not adding activation input quantizer for operation: 153 __module.model.22/aten::cat/Concat_4 - INFO:nncf:Not adding activation input quantizer for operation: 165 __module.model.22/prim::ListUnpack - INFO:nncf:Not adding activation input quantizer for operation: 180 __module.model.22.dfl/aten::view/Reshape - INFO:nncf:Not adding activation input quantizer for operation: 181 __module.model.22/aten::sigmoid/Sigmoid - INFO:nncf:Not adding activation input quantizer for operation: 197 __module.model.22.dfl/aten::transpose/Transpose - INFO:nncf:Not adding activation input quantizer for operation: 211 __module.model.22.dfl/aten::softmax/Softmax - INFO:nncf:Not adding activation input quantizer for operation: 224 __module.model.22.dfl.conv/aten::_convolution/Convolution - INFO:nncf:Not adding activation input quantizer for operation: 232 __module.model.22.dfl/aten::view/Reshape_1 - INFO:nncf:Not adding activation input quantizer for operation: 242 __module.model.22/prim::ListUnpack/VariadicSplit - INFO:nncf:Not adding activation input quantizer for operation: 251 __module.model.22/aten::sub/Subtract - INFO:nncf:Not adding activation input quantizer for operation: 252 __module.model.22/aten::add/Add_6 - INFO:nncf:Not adding activation input quantizer for operation: 262 __module.model.22/aten::add/Add_7 - 273 __module.model.22/aten::div/Divide - - INFO:nncf:Not adding activation input quantizer for operation: 263 __module.model.22/aten::sub/Subtract_1 - INFO:nncf:Not adding activation input quantizer for operation: 274 __module.model.22/aten::cat/Concat_5 - INFO:nncf:Not adding activation input quantizer for operation: 239 __module.model.22/aten::mul/Multiply_3 - INFO:nncf:Not adding activation input quantizer for operation: 198 __module.model.22/aten::cat/Concat_9 - - - -.. 
parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - print(f"Quantized keypoint detection model will be saved to {int8_model_pose_path}") - ov.save_model(quantized_pose_model, str(int8_model_pose_path)) - - -.. parsed-literal:: - - Quantized keypoint detection model will be saved to models/yolov8n-pose_openvino_int8_model/yolov8n-pose.xml - - -Validate Quantized model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -``nncf.quantize`` returns the OpenVINO Model class instance, which is -suitable for loading on a device for making predictions. ``INT8`` model -input data and output result formats have no difference from the -floating point model representation. Therefore, we can reuse the same -``detect`` function defined above for getting the ``INT8`` model result -on the image. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - device - -.. code:: ipython3 - - %%skip not $to_quantize.value - - ov_config = {} - if device.value != "CPU": - quantized_pose_model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device.value or ("AUTO" in device.value and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - quantized_pose_compiled_model = core.compile_model(quantized_pose_model, device.value, ov_config) - - def infer(*args): - result = quantized_pose_compiled_model(args) - return torch.from_numpy(result[0]) - - pose_model.predictor.inference = infer - - res = pose_model(IMAGE_PATH) - display(Image.fromarray(res[0].plot()[:, :, ::-1])) - - -.. parsed-literal:: - - - image 1/1 /home/akash/intel/openvino_notebooks/notebooks/yolov8-optimization/data/intel_rnb.jpg: 640x640 2 persons, 11.1ms - Speed: 3.8ms preprocess, 11.1ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640) - - - -.. image:: yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_44_1.png - - -Compare the Original and Quantized Models ------------------------------------------ - - - -Compare performance of the Original and Quantized Models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Finally, use the OpenVINO -`Benchmark -Tool `__ -to measure the inference performance of the ``FP32`` and ``INT8`` -models. - - **Note**: For more accurate performance, it is recommended to run - ``benchmark_app`` in a terminal/command prompt after closing other - applications. Run - ``benchmark_app -m -d CPU -shape ""`` to - benchmark async inference on CPU on specific input data shape for one - minute. Change ``CPU`` to ``GPU`` to benchmark on GPU. Run - ``benchmark_app --help`` to see an overview of all command-line - options. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - device - -.. code:: ipython3 - - if int8_model_pose_path.exists(): - # Inference FP32 model (OpenVINO IR) - !benchmark_app -m $pose_model_path -d $device.value -api async -shape "[1,3,640,640]" - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ WARNING ] Default duration 120 seconds is used for unknown device AUTO - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.3.0-16041-1e3b88e4e3f-releases/2024/3 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 
2024.3.0-16041-1e3b88e4e3f-releases/2024/3 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 30.04 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [?,3,?,?] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.22/aten::cat/Concat_9) : f32 / [...] / [?,56,21..] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 8.90 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.22/aten::cat/Concat_9) : f32 / [...] / [1,56,8400] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 366.94 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 120000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 41.78 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 16740 iterations - [ INFO ] Duration: 120117.71 ms - [ INFO ] Latency: - [ INFO ] Median: 85.71 ms - [ INFO ] Average: 85.92 ms - [ INFO ] Min: 64.95 ms - [ INFO ] Max: 115.78 ms - [ INFO ] Throughput: 139.36 FPS - - -.. code:: ipython3 - - if int8_model_pose_path.exists(): - # Inference INT8 model (OpenVINO IR) - !benchmark_app -m $int8_model_pose_path -d $device.value -api async -shape "[1,3,640,640]" -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 
2024.3.0-16041-1e3b88e4e3f-releases/2024/3 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.3.0-16041-1e3b88e4e3f-releases/2024/3 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 20.82 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,?,?] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.22/aten::cat/Concat_9) : f32 / [...] / [1,56,21..] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 11.11 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.22/aten::cat/Concat_9) : f32 / [...] / [1,56,8400] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 567.45 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 18 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 18 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 18 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 18 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 30.80 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 5688 iterations - [ INFO ] Duration: 15086.99 ms - [ INFO ] Latency: - [ INFO ] Median: 46.98 ms - [ INFO ] Average: 47.54 ms - [ INFO ] Min: 33.22 ms - [ INFO ] Max: 66.20 ms - [ INFO ] Throughput: 377.01 FPS - - -Compare accuracy of the Original and Quantized Models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -As we can see, there is no significant difference between ``INT8`` and -float model result in a single image test. 
To understand how -quantization influences model prediction precision, we can compare model -accuracy on a dataset. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - int8_pose_stats = test(quantized_pose_model, core, pose_data_loader, pose_validator, num_samples=NUM_TEST_SAMPLES) - - - -.. parsed-literal:: - - 0%| | 0/300 [00:00`__ - -Preprocessing API enables making preprocessing a part of the model -reducing application code and dependency on additional image processing -libraries. The main advantage of Preprocessing API is that preprocessing -steps will be integrated into the execution graph and will be performed -on a selected device (CPU/GPU etc.) rather than always being executed on -CPU as part of an application. This will also improve selected device -utilization. For more information, refer to the overview of -`Preprocessing API -tutorial `__. To -see, how it could be used with YOLOV8 object detection model , please, -see `Convert and Optimize YOLOv8 real-time object detection with -OpenVINO tutorial `__ - -Live demo ---------- - - - -The following code runs model inference on a video: - -.. code:: ipython3 - - import collections - import time - from IPython import display - import cv2 - - - def run_keypoint_detection( - source=0, - flip=False, - use_popup=False, - skip_first_frames=0, - model=pose_model, - device=device.value, - ): - player = None - - ov_config = {} - if device != "CPU": - model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device or ("AUTO" in device and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - compiled_model = core.compile_model(model, device, ov_config) - - def infer(*args): - result = compiled_model(args) - return torch.from_numpy(result[0]) - - pose_model.predictor.inference = infer - - try: - # Create a video player to play with target fps. - player = VideoPlayer(source=source, flip=flip, fps=30, skip_first_frames=skip_first_frames) - # Start capturing. - player.start() - if use_popup: - title = "Press ESC to Exit" - cv2.namedWindow(winname=title, flags=cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE) - - processing_times = collections.deque() - while True: - # Grab the frame. - frame = player.next() - if frame is None: - print("Source ended") - break - # If the frame is larger than full HD, reduce size to improve the performance. - scale = 1280 / max(frame.shape) - if scale < 1: - frame = cv2.resize( - src=frame, - dsize=None, - fx=scale, - fy=scale, - interpolation=cv2.INTER_AREA, - ) - # Get the results - input_image = np.array(frame) - - start_time = time.time() - - detections = pose_model(input_image) - stop_time = time.time() - frame = detections[0].plot() - - processing_times.append(stop_time - start_time) - # Use processing times from last 200 frames. - if len(processing_times) > 200: - processing_times.popleft() - - _, f_width = frame.shape[:2] - # Mean processing time [ms]. - processing_time = np.mean(processing_times) * 1000 - fps = 1000 / processing_time - cv2.putText( - img=frame, - text=f"Inference time: {processing_time:.1f}ms ({fps:.1f} FPS)", - org=(20, 40), - fontFace=cv2.FONT_HERSHEY_COMPLEX, - fontScale=f_width / 1000, - color=(0, 0, 255), - thickness=1, - lineType=cv2.LINE_AA, - ) - # Use this workaround if there is flickering. - if use_popup: - cv2.imshow(winname=title, mat=frame) - key = cv2.waitKey(1) - # escape = 27 - if key == 27: - break - else: - # Encode numpy array to jpg. 
- _, encoded_img = cv2.imencode(ext=".jpg", img=frame, params=[cv2.IMWRITE_JPEG_QUALITY, 100]) - # Create an IPython image. - i = display.Image(data=encoded_img) - # Display the image in this notebook. - display.clear_output(wait=True) - display.display(i) - # ctrl-c - except KeyboardInterrupt: - print("Interrupted") - # any different error - except RuntimeError as e: - print(e) - finally: - if player is not None: - # Stop capturing. - player.stop() - if use_popup: - cv2.destroyAllWindows() - -Run Keypoint Detection on video -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - VIDEO_SOURCE = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/people.mp4" - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - run_keypoint_detection( - source=VIDEO_SOURCE, - flip=True, - use_popup=False, - model=pose_ov_model, - device=device.value, - ) - - - -.. image:: yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_60_0.png - - -.. parsed-literal:: - - Source ended - diff --git a/docs/notebooks/yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_16_1.png b/docs/notebooks/yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_16_1.png deleted file mode 100644 index 450f6588880d47..00000000000000 --- a/docs/notebooks/yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_16_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:42949a1aafcd0fc859ae809c3a5b14e90c493195f5defbdd62a9824e69955ba2 -size 546464 diff --git a/docs/notebooks/yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_44_1.png b/docs/notebooks/yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_44_1.png deleted file mode 100644 index 35d8c37d3e2dcc..00000000000000 --- a/docs/notebooks/yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_44_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7050d4b9b8a89b7d02cf2d9473c05e9b362f4c0671117fd9d749ef5c0cb194e6 -size 546516 diff --git a/docs/notebooks/yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_60_0.png b/docs/notebooks/yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_60_0.png deleted file mode 100644 index 568f081fc421f9..00000000000000 --- a/docs/notebooks/yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_60_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f7ef4946fe5766299ee01c0bcf12a6e96471cde501a8677e45c3add80356e15f -size 523303 diff --git a/docs/notebooks/yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_9_3.png b/docs/notebooks/yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_9_3.png deleted file mode 100644 index 7afb459978574a..00000000000000 --- a/docs/notebooks/yolov8-keypoint-detection-with-output_files/yolov8-keypoint-detection-with-output_9_3.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0ab6cc809b41e70136a4b37fea93f5f899eb94487dbb3e8a78cb8a90201ca3c1 -size 548409 diff --git a/docs/notebooks/yolov8-obb-with-output.rst b/docs/notebooks/yolov8-obb-with-output.rst deleted file mode 100644 index 
ba316c662468b7..00000000000000 --- a/docs/notebooks/yolov8-obb-with-output.rst +++ /dev/null @@ -1,730 +0,0 @@ -YOLOv8 Oriented Bounding Boxes Object Detection with OpenVINO™ -============================================================== - -`YOLOv8-OBB `__ is introduced -by Ultralytics. - -Oriented object detection goes a step further than object detection and -introduce an extra angle to locate objects more accurate in an image. - -The output of an oriented object detector is a set of rotated bounding -boxes that exactly enclose the objects in the image, along with class -labels and confidence scores for each box. Object detection is a good -choice when you need to identify objects of interest in a scene, but -don’t need to know exactly where the object is or its exact shape. - - -**Table of contents:** - - -- `Prerequisites <#prerequisites>`__ -- `Get PyTorch model <#get-pytorch-model>`__ -- `Prepare dataset and dataloader <#prepare-dataset-and-dataloader>`__ -- `Run inference <#run-inference>`__ -- `Convert PyTorch model to OpenVINO - IR <#convert-pytorch-model-to-openvino-ir>`__ - - - `Select inference device <#select-inference-device>`__ - - `Compile model <#compile-model>`__ - - `Prepare the model for - inference <#prepare-the-model-for-inference>`__ - - `Run inference <#run-inference>`__ - -- `Quantization <#quantization>`__ -- `Compare inference time and model - sizes. <#compare-inference-time-and-model-sizes>`__ - - - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %pip install -q "ultralytics==8.3.59" "openvino>=2024.0.0" "nncf>=2.9.0" tqdm - -Import required utility functions. The lower cell will download the -notebook_utils Python module from GitHub. - -.. code:: ipython3 - - from pathlib import Path - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("yolov8-obb.ipynb") - -Get PyTorch model -~~~~~~~~~~~~~~~~~ - - - -Generally, PyTorch models represent an instance of the -`torch.nn.Module `__ -class, initialized by a state dictionary with model weights. We will use -the YOLOv8 pretrained OBB large model (also known as ``yolov8l-obbn``) -pre-trained on a DOTAv1 dataset, which is available in this -`repo `__. Similar steps are -also applicable to other YOLOv8 models. - -.. code:: ipython3 - - from ultralytics import YOLO - - model = YOLO("yolov8l-obb.pt") - -Prepare dataset and dataloader -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -YOLOv8-obb is pre-trained on the DOTA dataset. Also, Ultralytics -provides DOTA8 dataset. It is a small, but versatile oriented object -detection dataset composed of the first 8 images of 8 images of the -split DOTAv1 set, 4 for training and 4 for validation. This dataset is -ideal for testing and debugging object detection models, or for -experimenting with new detection approaches. 
With 8 images, it is small -enough to be easily manageable, yet diverse enough to test training -pipelines for errors and act as a sanity check before training larger -datasets. - -The original model repository uses a Validator wrapper, which represents -the accuracy validation pipeline. It creates dataloader and evaluation -metrics and updates metrics on each data batch produced by the -dataloader. Besides that, it is responsible for data preprocessing and -results postprocessing. For class initialization, the configuration -should be provided. We will use the default setup, but it can be -replaced with some parameters overriding to test on custom data. The -model has connected the task_map, which allows to get a validator class -instance. - -.. code:: ipython3 - - from ultralytics.cfg import get_cfg - from ultralytics.data.utils import check_det_dataset - from ultralytics.utils import DEFAULT_CFG, DATASETS_DIR - - - CFG_URL = "https://raw.githubusercontent.com/ultralytics/ultralytics/main/ultralytics/cfg/datasets/dota8.yaml" - OUT_DIR = Path("./datasets") - CFG_PATH = OUT_DIR / "dota8.yaml" - - if not CFG_PATH.exists(): - download_file(CFG_URL, CFG_PATH.name, CFG_PATH.parent) - - args = get_cfg(cfg=DEFAULT_CFG) - args.data = CFG_PATH - args.task = model.task - - validator = model.task_map[model.task]["validator"](args=args) - - validator.stride = 32 - validator.data = check_det_dataset(str(args.data)) - data_loader = validator.get_dataloader(DATASETS_DIR / "dota8", 1) - example_image_path = list(data_loader)[1]["im_file"][0] - - - -.. parsed-literal:: - - datasets/dota8.yaml: 0%| | 0.00/608 [00:00`__ enables -post-training quantization by adding quantization layers into model -graph and then using a subset of the training dataset to initialize the -parameters of these additional quantization layers. Quantized operations -are executed in ``INT8`` instead of ``FP32``/``FP16`` making model -inference faster. - -The optimization process contains the following steps: - -1. Create a calibration dataset for quantization. -2. Run ``nncf.quantize()`` to obtain quantized model. -3. Save the ``INT8`` model using ``openvino.save_model()`` function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - import ipywidgets as widgets - - INT8_OV_PATH = Path("model/int8_model.xml") - - to_quantize = widgets.Checkbox( - value=True, - description="Quantization", - disabled=False, - ) - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. code:: ipython3 - - # Fetch skip_kernel_extension module - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -.. code:: ipython3 - - %%skip not $to_quantize.value - - from typing import Dict - - import nncf - - - def transform_fn(data_item: Dict): - input_tensor = validator.preprocess(data_item)["img"].numpy() - return input_tensor - - - quantization_dataset = nncf.Dataset(data_loader, transform_fn) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -Create a quantized model from the pre-trained converted OpenVINO model. 
- - **NOTE**: Quantization is time and memory consuming operation. - Running quantization code below may take some time. - -.. - - **NOTE**: We use the tiny DOTA8 dataset as a calibration dataset. It - gives a good enough result for tutorial purpose. For batter results, - use a bigger dataset. Usually 300 examples are enough. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - if INT8_OV_PATH.exists(): - print("Loading quantized model") - quantized_model = core.read_model(INT8_OV_PATH) - else: - ov_model.reshape({0: [1, 3, -1, -1]}) - quantized_model = nncf.quantize( - ov_model, - quantization_dataset, - preset=nncf.QuantizationPreset.MIXED, - ) - ov.save_model(quantized_model, INT8_OV_PATH) - - - ov_config = {} - if device.value != "CPU": - quantized_model.reshape({0: [1, 3, 1024, 1024]}) - if "GPU" in device.value or ("AUTO" in device.value and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - - model_optimized = core.compile_model(quantized_model, device.value, ov_config) - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -We can reuse the base model pipeline in the same way as for IR model. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - def infer(*args): - result = model_optimized(args)[0] - return torch.from_numpy(result) - - model.predictor.inference = infer - -Run inference - -.. code:: ipython3 - - %%skip not $to_quantize.value - - res = model(example_image_path, device='cpu') - Image.fromarray(res[0].plot()[:, :, ::-1]) - - -.. parsed-literal:: - - - image 1/1 /home/ea/work/openvino_notebooks/notebooks/fast-segment-anything/datasets/dota8/images/train/P1053__1024__0___90.jpg: 1024x1024 240.5ms - Speed: 3.2ms preprocess, 240.5ms inference, 4.2ms postprocess per image at shape (1, 3, 1024, 1024) - - -You can see that the result is almost the same but it has a small -difference. One small vehicle was recognized as two vehicles. But one -large car was also identified, unlike the original model. - -Compare inference time and model sizes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - fp16_ir_model_size = OV_MODEL_PATH.with_suffix(".bin").stat().st_size / 1024 - quantized_model_size = INT8_OV_PATH.with_suffix(".bin").stat().st_size / 1024 - - print(f"FP16 model size: {fp16_ir_model_size:.2f} KB") - print(f"INT8 model size: {quantized_model_size:.2f} KB") - print(f"Model compression rate: {fp16_ir_model_size / quantized_model_size:.3f}") - - -.. parsed-literal:: - - FP16 model size: 86849.05 KB - INT8 model size: 43494.78 KB - Model compression rate: 1.997 - - -.. code:: ipython3 - - # Inference FP32 model (OpenVINO IR) - !benchmark_app -m $OV_MODEL_PATH -d $device.value -api async -shape "[1,3,640,640]" - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ WARNING ] Default duration 120 seconds is used for unknown device AUTO - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.0.0-14509-34caeefd078-releases/2024/0 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.0.0-14509-34caeefd078-releases/2024/0 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. 
Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 25.07 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [?,3,?,?] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.22/aten::cat/Concat_9) : f32 / [...] / [?,20,16..] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 10.42 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.22/aten::cat/Concat_9) : f32 / [...] / [1,20,8400] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 645.51 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 0 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 120000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 362.70 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 1620 iterations - [ INFO ] Duration: 121527.01 ms - [ INFO ] Latency: - [ INFO ] Median: 884.92 ms - [ INFO ] Average: 897.13 ms - [ INFO ] Min: 599.38 ms - [ INFO ] Max: 1131.46 ms - [ INFO ] Throughput: 13.33 FPS - - -.. code:: ipython3 - - if INT8_OV_PATH.exists(): - # Inference INT8 model (Quantized model) - !benchmark_app -m $INT8_OV_PATH -d $device.value -api async -shape "[1,3,640,640]" -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.0.0-14509-34caeefd078-releases/2024/0 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 
2024.0.0-14509-34caeefd078-releases/2024/0 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 46.47 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [?,3,?,?] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.22/aten::cat/Concat_9) : f32 / [...] / [?,20,16..] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 20.10 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.22/aten::cat/Concat_9) : f32 / [...] / [1,20,8400] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 1201.42 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 0 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). 
- [ INFO ] First inference took 124.20 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 708 iterations - [ INFO ] Duration: 15216.46 ms - [ INFO ] Latency: - [ INFO ] Median: 252.23 ms - [ INFO ] Average: 255.76 ms - [ INFO ] Min: 176.97 ms - [ INFO ] Max: 344.41 ms - [ INFO ] Throughput: 46.53 FPS - diff --git a/docs/notebooks/yolov8-obb-with-output_files/yolov8-obb-with-output_10_1.png b/docs/notebooks/yolov8-obb-with-output_files/yolov8-obb-with-output_10_1.png deleted file mode 100644 index 50aaf319c9aa36..00000000000000 --- a/docs/notebooks/yolov8-obb-with-output_files/yolov8-obb-with-output_10_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:119895531f8d84d19c1532048e112156f80b57512e0353b52a48c5899a3d5b3f -size 1421077 diff --git a/docs/notebooks/yolov8-obb-with-output_files/yolov8-obb-with-output_20_1.png b/docs/notebooks/yolov8-obb-with-output_files/yolov8-obb-with-output_20_1.png deleted file mode 100644 index 50aaf319c9aa36..00000000000000 --- a/docs/notebooks/yolov8-obb-with-output_files/yolov8-obb-with-output_20_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:119895531f8d84d19c1532048e112156f80b57512e0353b52a48c5899a3d5b3f -size 1421077 diff --git a/docs/notebooks/yolov8-object-detection-with-output.rst b/docs/notebooks/yolov8-object-detection-with-output.rst deleted file mode 100644 index da39635ef488c2..00000000000000 --- a/docs/notebooks/yolov8-object-detection-with-output.rst +++ /dev/null @@ -1,1902 +0,0 @@ -Convert and Optimize YOLOv8 real-time object detection with OpenVINO™ -===================================================================== - -Real-time object detection is often used as a key component in computer -vision systems. Applications that use real-time object detection models -include video analytics, robotics, autonomous vehicles, multi-object -tracking and object counting, medical image analysis, and many others. - -This tutorial demonstrates step-by-step instructions on how to run and -optimize PyTorch YOLOv8 with OpenVINO. We consider the steps required -for object detection scenario. - -The tutorial consists of the following steps: - -- Prepare the PyTorch model. -- Download and prepare a dataset. -- Validate the original model. -- Convert the PyTorch model to OpenVINO IR. -- Validate the converted model. -- Prepare and run optimization pipeline. -- Compare performance of the FP32 and quantized models. -- Compare accuracy of the FP32 and quantized models. 
-- Other optimization possibilities with OpenVINO api -- Live demo - - -**Table of contents:** - - -- `Get PyTorch model <#get-pytorch-model>`__ - - - `Prerequisites <#prerequisites>`__ - -- `Instantiate model <#instantiate-model>`__ - - - `Convert model to OpenVINO IR <#convert-model-to-openvino-ir>`__ - - `Verify model inference <#verify-model-inference>`__ - - `Select inference device <#select-inference-device>`__ - - `Test on single image <#test-on-single-image>`__ - -- `Check model accuracy on the - dataset <#check-model-accuracy-on-the-dataset>`__ - - - `Download the validation - dataset <#download-the-validation-dataset>`__ - - `Define validation function <#define-validation-function>`__ - - `Configure Validator helper and create - DataLoader <#configure-validator-helper-and-create-dataloader>`__ - -- `Optimize model using NNCF Post-training Quantization - API <#optimize-model-using-nncf-post-training-quantization-api>`__ - - - `Validate Quantized model - inference <#validate-quantized-model-inference>`__ - -- `Compare the Original and Quantized - Models <#compare-the-original-and-quantized-models>`__ - - - `Compare performance object detection - models <#compare-performance-object-detection-models>`__ - - `Validate quantized model - accuracy <#validate-quantized-model-accuracy>`__ - -- `Next steps <#next-steps>`__ - - - `Async inference pipeline <#async-inference-pipeline>`__ - - `Integration preprocessing to - model <#integration-preprocessing-to-model>`__ - - - `Initialize PrePostProcessing - API <#initialize-prepostprocessing-api>`__ - - `Define input data format <#define-input-data-format>`__ - - `Describe preprocessing - steps <#describe-preprocessing-steps>`__ - - `Integrating Steps into a - Model <#integrating-steps-into-a-model>`__ - - `Postprocessing <#postprocessing>`__ - -- `Live demo <#live-demo>`__ - - - `Run Live Object Detection <#run-live-object-detection>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Get PyTorch model ------------------ - - - -Generally, PyTorch models represent an instance of the -`torch.nn.Module `__ -class, initialized by a state dictionary with model weights. We will use -the YOLOv8 nano model (also known as ``yolov8n``) pre-trained on a COCO -dataset, which is available in this -`repo `__. Similar steps are -also applicable to other YOLOv8 models. Typical steps to obtain a -pre-trained model: 1. Create an instance of a model class. 2. Load a -checkpoint state dict, which contains the pre-trained model weights. 3. -Turn the model to evaluation for switching some operations to inference -mode. - -In this case, the creators of the model provide an API that enables -converting the YOLOv8 model to ONNX and then to OpenVINO IR. Therefore, -we do not need to do these steps manually. - -Prerequisites -^^^^^^^^^^^^^ - - - -Install necessary packages. - -.. code:: ipython3 - - %pip install -q "openvino>=2024.0.0" "nncf>=2.9.0" - %pip install -q "torch>=2.1" "torchvision>=0.16" "ultralytics==8.3.59" onnx tqdm opencv-python --extra-index-url https://download.pytorch.org/whl/cpu - -Import required utility functions. The lower cell will download the -``notebook_utils`` Python module from GitHub. - -.. 
code:: ipython3 - - from pathlib import Path - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - from notebook_utils import download_file, VideoPlayer, device_widget, quantization_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("yolov8-object-detection.ipynb") - -.. code:: ipython3 - - # Download a test sample - IMAGE_PATH = Path("./data/coco_bike.jpg") - if not IMAGE_PATH.exists(): - download_file( - url="https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bike.jpg", - filename=IMAGE_PATH.name, - directory=IMAGE_PATH.parent, - ) - - -.. parsed-literal:: - - 'data/coco_bike.jpg' already exists. - - - - -.. parsed-literal:: - - PosixPath('/home/akash/intel/openvino_notebooks/notebooks/yolov8-optimization/data/coco_bike.jpg') - - - -Instantiate model ------------------ - - - -There are `several -models `__ available in the -original repository, targeted for different tasks. For loading the -model, required to specify a path to the model checkpoint. It can be -some local path or name available on models hub (in this case model -checkpoint will be downloaded automatically). - -Making prediction, the model accepts a path to input image and returns -list with Results class object. Results contains boxes for object -detection model. Also it contains utilities for processing results, for -example, ``plot()`` method for drawing. - -Let us consider the examples: - -.. code:: ipython3 - - models_dir = Path("./models") - models_dir.mkdir(exist_ok=True) - -.. code:: ipython3 - - from PIL import Image - from ultralytics import YOLO - - DET_MODEL_NAME = "yolov8n" - - det_model = YOLO(models_dir / f"{DET_MODEL_NAME}.pt") - label_map = det_model.model.names - - res = det_model(IMAGE_PATH) - Image.fromarray(res[0].plot()[:, :, ::-1]) - - -.. parsed-literal:: - - Downloading https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n.pt to 'models/yolov8n.pt'... - - -.. parsed-literal:: - - 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 6.25M/6.25M [00:02<00:00, 2.29MB/s] - - -.. parsed-literal:: - - - image 1/1 /home/akash/intel/openvino_notebooks/notebooks/yolov8-optimization/data/coco_bike.jpg: 480x640 2 bicycles, 2 cars, 1 dog, 82.9ms - Speed: 2.4ms preprocess, 82.9ms inference, 475.6ms postprocess per image at shape (1, 3, 480, 640) - - - - -.. image:: yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_9_3.png - - - -Convert model to OpenVINO IR -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -YOLOv8 provides API for convenient model exporting to different formats -including OpenVINO IR. ``model.export`` is responsible for model -conversion. We need to specify the format, and additionally, we can -preserve dynamic shapes in the model. - -.. code:: ipython3 - - # object detection model - det_model_path = models_dir / f"{DET_MODEL_NAME}_openvino_model/{DET_MODEL_NAME}.xml" - if not det_model_path.exists(): - det_model.export(format="openvino", dynamic=True, half=True) - - -.. 
parsed-literal:: - - Ultralytics YOLOv8.2.24 🚀 Python-3.8.10 torch-2.1.0+cu121 CPU (Intel Core(TM) i9-10980XE 3.00GHz) - - PyTorch: starting from 'models/yolov8n.pt' with input shape (1, 3, 640, 640) BCHW and output shape(s) (1, 84, 8400) (6.2 MB) - - OpenVINO: starting export with openvino 2024.3.0-16041-1e3b88e4e3f-releases/2024/3... - OpenVINO: export success ✅ 1.7s, saved as 'models/yolov8n_openvino_model/' (6.4 MB) - - Export complete (3.1s) - Results saved to /home/akash/intel/openvino_notebooks/notebooks/yolov8-optimization/models - Predict: yolo predict task=detect model=models/yolov8n_openvino_model imgsz=640 half - Validate: yolo val task=detect model=models/yolov8n_openvino_model imgsz=640 data=coco.yaml half - Visualize: https://netron.app - - -Verify model inference -~~~~~~~~~~~~~~~~~~~~~~ - - - -We can reuse the base model pipeline for pre- and postprocessing just -replacing the inference method where we will use the IR model for -inference. - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -Select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -Test on single image -~~~~~~~~~~~~~~~~~~~~ - - - -Now, once we have defined preprocessing and postprocessing steps, we are -ready to check model prediction for object detection. - -.. code:: ipython3 - - import torch - import openvino as ov - - core = ov.Core() - - det_ov_model = core.read_model(det_model_path) - - ov_config = {} - if device.value != "CPU": - det_ov_model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device.value or ("AUTO" in device.value and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - det_compiled_model = core.compile_model(det_ov_model, device.value, ov_config) - - - def infer(*args): - result = det_compiled_model(args) - return torch.from_numpy(result[0]) - - - det_model.predictor.inference = infer - det_model.predictor.model.pt = False - - res = det_model(IMAGE_PATH) - Image.fromarray(res[0].plot()[:, :, ::-1]) - - -.. parsed-literal:: - - - image 1/1 /home/akash/intel/openvino_notebooks/notebooks/yolov8-optimization/data/coco_bike.jpg: 640x640 2 bicycles, 2 cars, 1 dog, 16.1ms - Speed: 3.4ms preprocess, 16.1ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 640) - - - - -.. image:: yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_16_1.png - - - -Check model accuracy on the dataset ------------------------------------ - - - -For comparing the optimized model result with the original, it is good -to know some measurable results in terms of model accuracy on the -validation dataset. - -Download the validation dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -YOLOv8 is pre-trained on the COCO dataset, so to evaluate the model -accuracy we need to download it. According to the instructions provided -in the YOLOv8 repo, we also need to download annotations in the format -used by the author of the model, for use with the original model -evaluation function. - - **Note**: The initial dataset download may take a few minutes to - complete. The download speed will vary depending on the quality of - your internet connection. - -.. 
code:: ipython3 - - from zipfile import ZipFile - - from ultralytics.data.utils import DATASETS_DIR - - - DATA_URL = "http://images.cocodataset.org/zips/val2017.zip" - LABELS_URL = "https://github.com/ultralytics/yolov5/releases/download/v1.0/coco2017labels-segments.zip" - CFG_URL = "https://raw.githubusercontent.com/ultralytics/ultralytics/v8.1.0/ultralytics/cfg/datasets/coco.yaml" - - OUT_DIR = DATASETS_DIR - - DATA_PATH = OUT_DIR / "val2017.zip" - LABELS_PATH = OUT_DIR / "coco2017labels-segments.zip" - CFG_PATH = OUT_DIR / "coco.yaml" - - if not (OUT_DIR / "coco/labels").exists(): - download_file(DATA_URL, DATA_PATH.name, DATA_PATH.parent) - download_file(LABELS_URL, LABELS_PATH.name, LABELS_PATH.parent) - download_file(CFG_URL, CFG_PATH.name, CFG_PATH.parent) - with ZipFile(LABELS_PATH, "r") as zip_ref: - zip_ref.extractall(OUT_DIR) - with ZipFile(DATA_PATH, "r") as zip_ref: - zip_ref.extractall(OUT_DIR / "coco/images") - - - -.. parsed-literal:: - - /home/akash/intel/NNCF/nncf/examples/post_training_quantization/openvino/yolov8/datasets/val2017.zip: 0%| … - - -.. parsed-literal:: - - IOPub message rate exceeded. - The Jupyter server will temporarily stop sending output - to the client in order to avoid crashing it. - To change this limit, set the config variable - `--ServerApp.iopub_msg_rate_limit`. - - Current values: - ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec) - ServerApp.rate_limit_window=3.0 (secs) - - - -.. parsed-literal:: - - '/home/akash/intel/NNCF/nncf/examples/post_training_quantization/openvino/yolov8/datasets/coco2017labels-segments.zip' already exists. - - - -.. parsed-literal:: - - /home/akash/intel/NNCF/nncf/examples/post_training_quantization/openvino/yolov8/datasets/coco.yaml: 0%| … - - -Define validation function -~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - import numpy as np - - from tqdm.notebook import tqdm - from ultralytics.utils.metrics import ConfusionMatrix - - - def test( - model: ov.Model, - core: ov.Core, - data_loader: torch.utils.data.DataLoader, - validator, - num_samples: int = None, - ): - """ - OpenVINO YOLOv8 model accuracy validation function.
Runs model validation on dataset and returns metrics - Parameters: - model (Model): OpenVINO model - data_loader (torch.utils.data.DataLoader): dataset loader - validator: instance of validator class - num_samples (int, *optional*, None): validate model only on specified number samples, if provided - Returns: - stats: (Dict[str, float]) - dictionary with aggregated accuracy metrics statistics, key is metric name, value is metric value - """ - validator.seen = 0 - validator.jdict = [] - validator.stats = dict(tp=[], conf=[], pred_cls=[], target_cls=[], target_img=[]) - validator.batch_i = 1 - validator.confusion_matrix = ConfusionMatrix(nc=validator.nc) - model.reshape({0: [1, 3, -1, -1]}) - compiled_model = core.compile_model(model) - for batch_i, batch in enumerate(tqdm(data_loader, total=num_samples)): - if num_samples is not None and batch_i == num_samples: - break - batch = validator.preprocess(batch) - results = compiled_model(batch["img"]) - preds = torch.from_numpy(results[compiled_model.output(0)]) - preds = validator.postprocess(preds) - validator.update_metrics(preds, batch) - stats = validator.get_stats() - return stats - - - def print_stats(stats: np.ndarray, total_images: int, total_objects: int): - """ - Helper function for printing accuracy statistic - Parameters: - stats: (Dict[str, float]) - dictionary with aggregated accuracy metrics statistics, key is metric name, value is metric value - total_images (int) - number of evaluated images - total objects (int) - Returns: - None - """ - print("Boxes:") - mp, mr, map50, mean_ap = ( - stats["metrics/precision(B)"], - stats["metrics/recall(B)"], - stats["metrics/mAP50(B)"], - stats["metrics/mAP50-95(B)"], - ) - # Print results - print(" Best mean average:") - s = ("%20s" + "%12s" * 6) % ( - "Class", - "Images", - "Labels", - "Precision", - "Recall", - "mAP@.5", - "mAP@.5:.95", - ) - print(s) - pf = "%20s" + "%12i" * 2 + "%12.3g" * 4 # print format - print(pf % ("all", total_images, total_objects, mp, mr, map50, mean_ap)) - if "metrics/precision(M)" in stats: - s_mp, s_mr, s_map50, s_mean_ap = ( - stats["metrics/precision(M)"], - stats["metrics/recall(M)"], - stats["metrics/mAP50(M)"], - stats["metrics/mAP50-95(M)"], - ) - # Print results - print(" Macro average mean:") - s = ("%20s" + "%12s" * 6) % ( - "Class", - "Images", - "Labels", - "Precision", - "Recall", - "mAP@.5", - "mAP@.5:.95", - ) - print(s) - pf = "%20s" + "%12i" * 2 + "%12.3g" * 4 # print format - print(pf % ("all", total_images, total_objects, s_mp, s_mr, s_map50, s_mean_ap)) - -Configure Validator helper and create DataLoader -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The original model repository uses a ``Validator`` wrapper, which -represents the accuracy validation pipeline. It creates dataloader and -evaluation metrics and updates metrics on each data batch produced by -the dataloader. Besides that, it is responsible for data preprocessing -and results postprocessing. For class initialization, the configuration -should be provided. We will use the default setup, but it can be -replaced with some parameters overriding to test on custom data. The -model has connected the ``ValidatorClass`` method, which creates a -validator class instance. - -.. code:: ipython3 - - from ultralytics.utils import DEFAULT_CFG - from ultralytics.cfg import get_cfg - from ultralytics.data.converter import coco80_to_coco91_class - from ultralytics.data.utils import check_det_dataset - - args = get_cfg(cfg=DEFAULT_CFG) - args.data = str(CFG_PATH) - -.. 
code:: ipython3 - - det_validator = det_model.task_map[det_model.task]["validator"](args=args) - -.. code:: ipython3 - - det_validator.data = check_det_dataset(args.data) - det_validator.stride = 32 - det_data_loader = det_validator.get_dataloader(OUT_DIR / "coco", 1) - - -.. parsed-literal:: - - val: Scanning /home/akash/intel/NNCF/nncf/examples/post_training_quantization/openvino/yolov8/datasets/coco/labels/val2017.cache... 4952 images, - - -.. code:: ipython3 - - det_validator.is_coco = True - det_validator.class_map = coco80_to_coco91_class() - det_validator.names = det_model.model.names - det_validator.metrics.names = det_validator.names - det_validator.nc = det_model.model.model[-1].nc - -After definition test function and validator creation, we are ready for -getting accuracy metrics >\ **Note**: Model evaluation is time consuming -process and can take several minutes, depending on the hardware. For -reducing calculation time, we define ``num_samples`` parameter with -evaluation subset size, but in this case, accuracy can be noncomparable -with originally reported by the authors of the model, due to validation -subset difference. *To validate the models on the full dataset set -``NUM_TEST_SAMPLES = None``.* - -.. code:: ipython3 - - NUM_TEST_SAMPLES = 300 - -.. code:: ipython3 - - fp_det_stats = test(det_ov_model, core, det_data_loader, det_validator, num_samples=NUM_TEST_SAMPLES) - - - -.. parsed-literal:: - - 0%| | 0/300 [00:00`__ provides a suite of -advanced algorithms for Neural Networks inference optimization in -OpenVINO with minimal accuracy drop. We will use 8-bit quantization in -post-training mode (without the fine-tuning pipeline) to optimize -YOLOv8. - -The optimization process contains the following steps: - -1. Create a Dataset for quantization. -2. Run ``nncf.quantize`` for getting an optimized model. -3. Serialize OpenVINO IR model, using the ``openvino.runtime.serialize`` - function. - -Please select below whether you would like to run quantization to -improve model inference speed. - -.. code:: ipython3 - - int8_model_det_path = models_dir / f"{DET_MODEL_NAME}_openvino_int8_model/{DET_MODEL_NAME}.xml" - quantized_det_model = None - - to_quantize = quantization_widget() - - to_quantize - - - - -.. parsed-literal:: - - Checkbox(value=True, description='Quantization') - - - -Let’s load ``skip magic`` extension to skip quantization if -``to_quantize`` is not selected - -.. code:: ipython3 - - # Fetch skip_kernel_extension module - if not Path("skip_kernel_extension.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", - ) - open("skip_kernel_extension.py", "w").write(r.text) - - %load_ext skip_kernel_extension - -Reuse validation dataloader in accuracy testing for quantization. For -that, it should be wrapped into the ``nncf.Dataset`` object and define a -transformation function for getting only input tensors. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - import nncf - from typing import Dict - - - def transform_fn(data_item:Dict): - """ - Quantization transform function. Extracts and preprocess input data from dataloader item for quantization. - Parameters: - data_item: Dict with data item produced by DataLoader during iteration - Returns: - input_tensor: Input data for quantization - """ - input_tensor = det_validator.preprocess(data_item)['img'].numpy() - return input_tensor - - - quantization_dataset = nncf.Dataset(det_data_loader, transform_fn) - - -.. 
parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino - - -The ``nncf.quantize`` function provides an interface for model -quantization. It requires an instance of the OpenVINO Model and -quantization dataset. Optionally, some additional parameters for the -configuration quantization process (number of samples for quantization, -preset, ignored scope, etc.) can be provided. YOLOv8 model contains -non-ReLU activation functions, which require asymmetric quantization of -activations. To achieve a better result, we will use a ``mixed`` -quantization preset. It provides symmetric quantization of weights and -asymmetric quantization of activations. For more accurate results, we -should keep the operation in the postprocessing subgraph in floating -point precision, using the ``ignored_scope`` parameter. - - **Note**: Model post-training quantization is time-consuming process. - Be patient, it can take several minutes depending on your hardware. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - ignored_scope = nncf.IgnoredScope( # post-processing - subgraphs=[ - nncf.Subgraph(inputs=['__module.model.22/aten::cat/Concat', - '__module.model.22/aten::cat/Concat_1', - '__module.model.22/aten::cat/Concat_2'], - outputs=['__module.model.22/aten::cat/Concat_7']) - ] - ) - - # Detection model - quantized_det_model = nncf.quantize( - det_ov_model, - quantization_dataset, - preset=nncf.QuantizationPreset.MIXED, - ignored_scope=ignored_scope - ) - - -.. parsed-literal:: - - INFO:nncf:105 ignored nodes were found by subgraphs in the NNCFGraph - INFO:nncf:Not adding activation input quantizer for operation: 126 __module.model.22/aten::cat/Concat - INFO:nncf:Not adding activation input quantizer for operation: 133 __module.model.22/aten::view/Reshape_3 - INFO:nncf:Not adding activation input quantizer for operation: 240 __module.model.22/aten::cat/Concat_1 - INFO:nncf:Not adding activation input quantizer for operation: 250 __module.model.22/aten::view/Reshape_4 - INFO:nncf:Not adding activation input quantizer for operation: 297 __module.model.22/aten::cat/Concat_2 - INFO:nncf:Not adding activation input quantizer for operation: 299 __module.model.22/aten::view/Reshape_5 - INFO:nncf:Not adding activation input quantizer for operation: 143 __module.model.22/aten::cat/Concat_4 - INFO:nncf:Not adding activation input quantizer for operation: 154 __module.model.22/prim::ListUnpack - INFO:nncf:Not adding activation input quantizer for operation: 166 __module.model.22.dfl/aten::view/Reshape - INFO:nncf:Not adding activation input quantizer for operation: 167 __module.model.22/aten::sigmoid/Sigmoid - INFO:nncf:Not adding activation input quantizer for operation: 180 __module.model.22.dfl/aten::transpose/Transpose - INFO:nncf:Not adding activation input quantizer for operation: 191 __module.model.22.dfl/aten::softmax/Softmax - INFO:nncf:Not adding activation input quantizer for operation: 201 __module.model.22.dfl.conv/aten::_convolution/Convolution - INFO:nncf:Not adding activation input quantizer for operation: 208 __module.model.22.dfl/aten::view/Reshape_1 - INFO:nncf:Not adding activation input quantizer for operation: 217 __module.model.22/prim::ListUnpack/VariadicSplit - INFO:nncf:Not adding activation input quantizer for operation: 225 __module.model.22/aten::sub/Subtract - INFO:nncf:Not adding activation input quantizer for operation: 226 __module.model.22/aten::add/Add_6 - INFO:nncf:Not adding activation input quantizer for operation: 235 
__module.model.22/aten::add/Add_7 - 244 __module.model.22/aten::div/Divide - - INFO:nncf:Not adding activation input quantizer for operation: 236 __module.model.22/aten::sub/Subtract_1 - INFO:nncf:Not adding activation input quantizer for operation: 245 __module.model.22/aten::cat/Concat_5 - INFO:nncf:Not adding activation input quantizer for operation: 214 __module.model.22/aten::mul/Multiply_3 - INFO:nncf:Not adding activation input quantizer for operation: 181 __module.model.22/aten::cat/Concat_7 - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - - -.. parsed-literal:: - - Output() - - - - - - - - - - - - - - - - - -.. code:: ipython3 - - %%skip not $to_quantize.value - - print(f"Quantized detection model will be saved to {int8_model_det_path}") - ov.save_model(quantized_det_model, str(int8_model_det_path)) - - -.. parsed-literal:: - - Quantized detection model will be saved to models/yolov8n_openvino_int8_model/yolov8n.xml - - -Validate Quantized model inference -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -``nncf.quantize`` returns the OpenVINO Model class instance, which is -suitable for loading on a device for making predictions. ``INT8`` model -input data and output result formats have no difference from the -floating point model representation. Therefore, we can reuse the same -``detect`` function defined above for getting the ``INT8`` model result -on the image. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - device - -.. code:: ipython3 - - %%skip not $to_quantize.value - - ov_config = {} - if device.value != "CPU": - quantized_det_model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device.value or ("AUTO" in device.value and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - quantized_det_compiled_model = core.compile_model(quantized_det_model, device.value, ov_config) - - - def infer(*args): - result = quantized_det_compiled_model(args) - return torch.from_numpy(result[0]) - - det_model.predictor.inference = infer - - res = det_model(IMAGE_PATH) - display(Image.fromarray(res[0].plot()[:, :, ::-1])) - - -.. parsed-literal:: - - - image 1/1 /home/akash/intel/openvino_notebooks/notebooks/yolov8-optimization/data/coco_bike.jpg: 640x640 2 bicycles, 2 cars, 1 dog, 10.9ms - Speed: 3.7ms preprocess, 10.9ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 640) - - - -.. image:: yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_43_1.png - - -Compare the Original and Quantized Models ------------------------------------------ - - - -Compare performance object detection models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Finally, use the OpenVINO `Benchmark -Tool `__ -to measure the inference performance of the ``FP32`` and ``INT8`` -models. - - **Note**: For more accurate performance, it is recommended to run - ``benchmark_app`` in a terminal/command prompt after closing other - applications. Run - ``benchmark_app -m -d CPU -shape ""`` to - benchmark async inference on CPU on specific input data shape for one - minute. Change ``CPU`` to ``GPU`` to benchmark on GPU. Run - ``benchmark_app --help`` to see an overview of all command-line - options. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - device - -.. code:: ipython3 - - if int8_model_det_path.exists(): - # Inference FP32 model (OpenVINO IR) - !benchmark_app -m $det_model_path -d $device.value -api async -shape "[1,3,640,640]" - - -.. 
parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ WARNING ] Default duration 120 seconds is used for unknown device AUTO - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.3.0-16041-1e3b88e4e3f-releases/2024/3 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.3.0-16041-1e3b88e4e3f-releases/2024/3 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 13.54 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [?,3,?,?] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.22/aten::cat/Concat_7) : f32 / [...] / [?,84,21..] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 7.16 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.22/aten::cat/Concat_7) : f32 / [...] / [1,84,8400] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 365.94 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! - [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 120000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). 
- [ INFO ] First inference took 40.11 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 16968 iterations - [ INFO ] Duration: 120039.34 ms - [ INFO ] Latency: - [ INFO ] Median: 84.56 ms - [ INFO ] Average: 84.70 ms - [ INFO ] Min: 38.59 ms - [ INFO ] Max: 140.59 ms - [ INFO ] Throughput: 141.35 FPS - - -.. code:: ipython3 - - if int8_model_det_path.exists(): - # Inference INT8 model (OpenVINO IR) - !benchmark_app -m $int8_model_det_path -d $device.value -api async -shape "[1,3,640,640]" -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.3.0-16041-1e3b88e4e3f-releases/2024/3 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.3.0-16041-1e3b88e4e3f-releases/2024/3 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 41.77 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] x (node: x) : f32 / [...] / [1,3,?,?] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.22/aten::cat/Concat_7) : f32 / [...] / [1,84,21..] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'x': [1,3,640,640] - [ INFO ] Reshape model took 14.55 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] x (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] ***NO_NAME*** (node: __module.model.22/aten::cat/Concat_7) : f32 / [...] / [1,84,8400] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 535.46 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 18 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] MODEL_DISTRIBUTION_POLICY: set() - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 18 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 18 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [ INFO ] PERF_COUNT: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'x'!. This input will be filled with random values! 
- [ INFO ] Fill input 'x' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 18 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 31.53 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 5598 iterations - [ INFO ] Duration: 15049.79 ms - [ INFO ] Latency: - [ INFO ] Median: 47.83 ms - [ INFO ] Average: 48.18 ms - [ INFO ] Min: 30.12 ms - [ INFO ] Max: 75.38 ms - [ INFO ] Throughput: 371.97 FPS - - -Validate quantized model accuracy -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -As we can see, there is no significant difference between ``INT8`` and -float model result in a single image test. To understand how -quantization influences model prediction precision, we can compare model -accuracy on a dataset. - -.. code:: ipython3 - - %%skip not $to_quantize.value - - int8_det_stats = test(quantized_det_model, core, det_data_loader, det_validator, num_samples=NUM_TEST_SAMPLES) - - - -.. parsed-literal:: - - 0%| | 0/300 [00:00`__ - -Integration preprocessing to model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Preprocessing API enables making preprocessing a part of the model -reducing application code and dependency on additional image processing -libraries. The main advantage of Preprocessing API is that preprocessing -steps will be integrated into the execution graph and will be performed -on a selected device (CPU/GPU etc.) rather than always being executed on -CPU as part of an application. This will improve selected device -utilization. - -For more information, refer to the overview of `Preprocessing -API `__. - -For example, we can integrate converting input data layout and -normalization defined in ``image_to_tensor`` function. - -The integration process consists of the following steps: 1. Initialize a -PrePostProcessing object. 2. Define the input data format. 3. Describe -preprocessing steps. 4. Integrating Steps into a Model. - -Initialize PrePostProcessing API -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -The ``openvino.preprocess.PrePostProcessor`` class enables specifying -preprocessing and postprocessing steps for a model. - -.. code:: ipython3 - - from openvino.preprocess import PrePostProcessor - - ppp = PrePostProcessor(quantized_det_model if quantized_det_model is not None else det_model) - -Define input data format -^^^^^^^^^^^^^^^^^^^^^^^^ - - - -To address particular input of a model/preprocessor, the -``input(input_id)`` method, where ``input_id`` is a positional index or -input tensor name for input in ``model.inputs``, if a model has a single -input, ``input_id`` can be omitted. After reading the image from the -disc, it contains U8 pixels in the ``[0, 255]`` range and is stored in -the ``NHWC`` layout. To perform a preprocessing conversion, we should -provide this to the tensor description. - -.. code:: ipython3 - - ppp.input(0).tensor().set_shape([1, 640, 640, 3]).set_element_type(ov.Type.u8).set_layout(ov.Layout("NHWC")) - pass - -To perform layout conversion, we also should provide information about -layout expected by model - -Describe preprocessing steps -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -Our preprocessing function contains the following steps: \* Convert the -data type from ``U8`` to ``FP32``. \* Convert the data layout from -``NHWC`` to ``NCHW`` format. \* Normalize each pixel by dividing on -scale factor 255. 
- -``ppp.input(input_id).preprocess()`` is used for defining a sequence of -preprocessing steps: - -.. code:: ipython3 - - ppp.input(0).preprocess().convert_element_type(ov.Type.f32).convert_layout(ov.Layout("NCHW")).scale([255.0, 255.0, 255.0]) - - print(ppp) - - -.. parsed-literal:: - - Input "x": - User's input tensor: [1,640,640,3], [N,H,W,C], u8 - Model's expected tensor: [1,3,?,?], [N,C,H,W], f32 - Pre-processing steps (3): - convert type (f32): ([1,640,640,3], [N,H,W,C], u8) -> ([1,640,640,3], [N,H,W,C], f32) - convert layout [N,C,H,W]: ([1,640,640,3], [N,H,W,C], f32) -> ([1,3,640,640], [N,C,H,W], f32) - scale (255,255,255): ([1,3,640,640], [N,C,H,W], f32) -> ([1,3,640,640], [N,C,H,W], f32) - - - -Integrating Steps into a Model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - - -Once the preprocessing steps have been finished, the model can be -finally built. Additionally, we can save a completed model to OpenVINO -IR, using ``openvino.runtime.serialize``. - -.. code:: ipython3 - - quantized_model_with_preprocess = ppp.build() - - with_preprocess_path = ( - int8_model_det_path.with_name(f"{DET_MODEL_NAME}_with_preprocess.xml") - if quantized_det_model is not None - else det_model_path.with_name(f"{DET_MODEL_NAME}_with_preprocess.xml") - ) - ov.save_model(quantized_model_with_preprocess, str(with_preprocess_path)) - -The model with integrated preprocessing is ready for loading to a -device. - -.. code:: ipython3 - - from typing import Tuple, Dict - import cv2 - import numpy as np - from ultralytics.utils.plotting import colors - import random - - - def plot_one_box( - box: np.ndarray, - img: np.ndarray, - color: Tuple[int, int, int] = None, - label: str = None, - line_thickness: int = 5, - ): - """ - Helper function for drawing single bounding box on image - Parameters: - x (np.ndarray): bounding box coordinates in format [x1, y1, x2, y2] - img (no.ndarray): input image - color (Tuple[int, int, int], *optional*, None): color in BGR format for drawing box, if not specified will be selected randomly - label (str, *optonal*, None): box label string, if not provided will not be provided as drowing result - line_thickness (int, *optional*, 5): thickness for box drawing lines - """ - # Plots one bounding box on image img - tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 # line/font thickness - color = color or [random.randint(0, 255) for _ in range(3)] - c1, c2 = (int(box[0]), int(box[1])), (int(box[2]), int(box[3])) - cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) - if label: - tf = max(tl - 1, 1) # font thickness - t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] - c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 - cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled - cv2.putText( - img, - label, - (c1[0], c1[1] - 2), - 0, - tl / 3, - [225, 255, 255], - thickness=tf, - lineType=cv2.LINE_AA, - ) - - return img - - - def draw_results(results: Dict, source_image: np.ndarray, label_map: Dict): - """ - Helper function for drawing bounding boxes on image - Parameters: - image_res (np.ndarray): detection predictions in format [x1, y1, x2, y2, score, label_id] - source_image (np.ndarray): input image for drawing - label_map; (Dict[int, str]): label_id to class name mapping - Returns: - Image with boxes - """ - boxes = results["det"] - for idx, (*xyxy, conf, lbl) in enumerate(boxes): - label = f"{label_map[int(lbl)]} {conf:.2f}" - source_image = plot_one_box(xyxy, source_image, label=label, color=colors(int(lbl)), line_thickness=1) - 
return source_image - -Postprocessing -'''''''''''''' - - - -The model output contains detection boxes candidates, it is a tensor -with the [-1,84,-1] shape in the B,84,N format, where: - -B - batch size N - number of detection boxes For getting the final -prediction, we need to apply a non-maximum suppression algorithm and -rescale box coordinates to the original image size. - -Finally, detection box has the [x, y, h, w, class_no_1, …, class_no_80] -format, where: - -(x, y) - raw coordinates of box center h, w - raw height and width of -the box class_no_1, …, class_no_80 - probability distribution over the -classes. - -.. code:: ipython3 - - from typing import Tuple - from ultralytics.utils import ops - import torch - import numpy as np - - - def letterbox( - img: np.ndarray, - new_shape: Tuple[int, int] = (640, 640), - color: Tuple[int, int, int] = (114, 114, 114), - auto: bool = False, - scale_fill: bool = False, - scaleup: bool = False, - stride: int = 32, - ): - """ - Resize image and padding for detection. Takes image as input, - resizes image to fit into new shape with saving original aspect ratio and pads it to meet stride-multiple constraints - - Parameters: - img (np.ndarray): image for preprocessing - new_shape (Tuple(int, int)): image size after preprocessing in format [height, width] - color (Tuple(int, int, int)): color for filling padded area - auto (bool): use dynamic input size, only padding for stride constrins applied - scale_fill (bool): scale image to fill new_shape - scaleup (bool): allow scale image if it is lower then desired input size, can affect model accuracy - stride (int): input padding stride - Returns: - img (np.ndarray): image after preprocessing - ratio (Tuple(float, float)): hight and width scaling ratio - padding_size (Tuple(int, int)): height and width padding size - - - """ - # Resize and pad image while meeting stride-multiple constraints - shape = img.shape[:2] # current shape [height, width] - if isinstance(new_shape, int): - new_shape = (new_shape, new_shape) - - # Scale ratio (new / old) - r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) - if not scaleup: # only scale down, do not scale up (for better test mAP) - r = min(r, 1.0) - - # Compute padding - ratio = r, r # width, height ratios - new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) - dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding - if auto: # minimum rectangle - dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding - elif scale_fill: # stretch - dw, dh = 0.0, 0.0 - new_unpad = (new_shape[1], new_shape[0]) - ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios - - dw /= 2 # divide padding into 2 sides - dh /= 2 - - if shape[::-1] != new_unpad: # resize - img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR) - top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) - left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) - img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border - return img, ratio, (dw, dh) - - - def postprocess( - pred_boxes: np.ndarray, - input_hw: Tuple[int, int], - orig_img: np.ndarray, - min_conf_threshold: float = 0.25, - nms_iou_threshold: float = 0.7, - agnosting_nms: bool = False, - max_detections: int = 300, - ): - """ - YOLOv8 model postprocessing function. 
Applied non maximum supression algorithm to detections and rescale boxes to original image size - Parameters: - pred_boxes (np.ndarray): model output prediction boxes - input_hw (np.ndarray): preprocessed image - orig_image (np.ndarray): image before preprocessing - min_conf_threshold (float, *optional*, 0.25): minimal accepted confidence for object filtering - nms_iou_threshold (float, *optional*, 0.45): minimal overlap score for removing objects duplicates in NMS - agnostic_nms (bool, *optiona*, False): apply class agnostinc NMS approach or not - max_detections (int, *optional*, 300): maximum detections after NMS - Returns: - pred (List[Dict[str, np.ndarray]]): list of dictionary with det - detected boxes in format [x1, y1, x2, y2, score, label] - """ - nms_kwargs = {"agnostic": agnosting_nms, "max_det": max_detections} - preds = ops.non_max_suppression(torch.from_numpy(pred_boxes), min_conf_threshold, nms_iou_threshold, nc=80, **nms_kwargs) - - results = [] - for i, pred in enumerate(preds): - shape = orig_img[i].shape if isinstance(orig_img, list) else orig_img.shape - if not len(pred): - results.append({"det": [], "segment": []}) - continue - pred[:, :4] = ops.scale_boxes(input_hw, pred[:, :4], shape).round() - results.append({"det": pred}) - - return results - -Now, we can skip these preprocessing steps in detect function: - -.. code:: ipython3 - - def detect_without_preprocess(image: np.ndarray, model: ov.Model): - """ - OpenVINO YOLOv8 model with integrated preprocessing inference function. Preprocess image, runs model inference and postprocess results using NMS. - Parameters: - image (np.ndarray): input image. - model (Model): OpenVINO compiled model. - Returns: - detections (np.ndarray): detected boxes in format [x1, y1, x2, y2, score, label] - """ - output_layer = model.output(0) - img = letterbox(image)[0] - input_tensor = np.expand_dims(img, 0) - input_hw = img.shape[:2] - result = model(input_tensor)[output_layer] - detections = postprocess(result, input_hw, image) - return detections - - - compiled_model = core.compile_model(quantized_model_with_preprocess, device.value) - input_image = np.array(Image.open(IMAGE_PATH)) - detections = detect_without_preprocess(input_image, compiled_model)[0] - image_with_boxes = draw_results(detections, input_image, label_map) - - Image.fromarray(image_with_boxes) - - - - -.. image:: yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_70_0.png - - - -Live demo ---------- - - - -The following code runs model inference on a video: - -.. code:: ipython3 - - import collections - import time - from IPython import display - - det_ov_model - - - # Main processing function to run object detection. - def run_object_detection( - source=0, - flip=False, - use_popup=False, - skip_first_frames=0, - model=det_model, - device=device.value, - ): - player = None - ov_config = {} - if device != "CPU": - model.reshape({0: [1, 3, 640, 640]}) - if "GPU" in device or ("AUTO" in device and "GPU" in core.available_devices): - ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - compiled_model = core.compile_model(model, device, ov_config) - - def infer(*args): - result = compiled_model(args) - return torch.from_numpy(result[0]) - - det_model.predictor.inference = infer - - try: - # Create a video player to play with target fps. - player = VideoPlayer(source=source, flip=flip, fps=30, skip_first_frames=skip_first_frames) - # Start capturing. 
- player.start() - if use_popup: - title = "Press ESC to Exit" - cv2.namedWindow(winname=title, flags=cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE) - - processing_times = collections.deque() - while True: - # Grab the frame. - frame = player.next() - if frame is None: - print("Source ended") - break - # If the frame is larger than full HD, reduce size to improve the performance. - scale = 1280 / max(frame.shape) - if scale < 1: - frame = cv2.resize( - src=frame, - dsize=None, - fx=scale, - fy=scale, - interpolation=cv2.INTER_AREA, - ) - # Get the results. - input_image = np.array(frame) - - start_time = time.time() - detections = det_model(input_image, verbose=False) - stop_time = time.time() - frame = detections[0].plot() - - processing_times.append(stop_time - start_time) - # Use processing times from last 200 frames. - if len(processing_times) > 200: - processing_times.popleft() - - _, f_width = frame.shape[:2] - # Mean processing time [ms]. - processing_time = np.mean(processing_times) * 1000 - fps = 1000 / processing_time - cv2.putText( - img=frame, - text=f"Inference time: {processing_time:.1f}ms ({fps:.1f} FPS)", - org=(20, 40), - fontFace=cv2.FONT_HERSHEY_COMPLEX, - fontScale=f_width / 1000, - color=(0, 0, 255), - thickness=1, - lineType=cv2.LINE_AA, - ) - # Use this workaround if there is flickering. - if use_popup: - cv2.imshow(winname=title, mat=frame) - key = cv2.waitKey(1) - # escape = 27 - if key == 27: - break - else: - # Encode numpy array to jpg. - _, encoded_img = cv2.imencode(ext=".jpg", img=frame, params=[cv2.IMWRITE_JPEG_QUALITY, 100]) - # Create an IPython image. - i = display.Image(data=encoded_img) - # Display the image in this notebook. - display.clear_output(wait=True) - display.display(i) - # ctrl-c - except KeyboardInterrupt: - print("Interrupted") - # any different error - except RuntimeError as e: - print(e) - finally: - if player is not None: - # Stop capturing. - player.stop() - if use_popup: - cv2.destroyAllWindows() - -Run Live Object Detection -~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -Use a webcam as the video input. By default, the primary webcam is set -with \ ``source=0``. If you have multiple webcams, each one will be -assigned a consecutive number starting at 0. Set \ ``flip=True`` when -using a front-facing camera. Some web browsers, especially Mozilla -Firefox, may cause flickering. If you experience flickering, -set \ ``use_popup=True``. - - **NOTE**: To use this notebook with a webcam, you need to run the - notebook on a computer with a webcam. If you run the notebook on a - remote server (for example, in Binder or Google Colab service), the - webcam will not work. By default, the lower cell will run model - inference on a video file. If you want to try live inference on your - webcam set ``WEBCAM_INFERENCE = True`` - -Run the object detection: - -.. code:: ipython3 - - WEBCAM_INFERENCE = False - - if WEBCAM_INFERENCE: - VIDEO_SOURCE = 0 # Webcam - else: - VIDEO_URL = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/people.mp4" - VIDEO_SOURCE = Path("people.mp4") - if not VIDEO_SOURCE.exists(): - download_file(VIDEO_URL) - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - run_object_detection( - source=VIDEO_SOURCE, - flip=True, - use_popup=False, - model=det_ov_model, - device=device.value, - ) - - - -.. 
image:: yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_76_0.png - - -.. parsed-literal:: - - Source ended - diff --git a/docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_16_1.png b/docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_16_1.png deleted file mode 100644 index 703c1375a98784..00000000000000 --- a/docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_16_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e85f0434876997adc7cf2152fa667b4e271817a443a75ca350d284aee48f9145 -size 908034 diff --git a/docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_43_1.png b/docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_43_1.png deleted file mode 100644 index 44e7623ec662d1..00000000000000 --- a/docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_43_1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6778bdd31a39722d8ce3fb9ddea6383e451ab1b32cf23fad23b556bd89c99707 -size 907722 diff --git a/docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_70_0.png b/docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_70_0.png deleted file mode 100644 index 3b025ea9323f25..00000000000000 --- a/docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_70_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b44f35cf238dcd208fcebe73bb388947f5528a167b511c8098d040b05d9bb819 -size 931491 diff --git a/docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_76_0.png b/docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_76_0.png deleted file mode 100644 index 6d628536486d93..00000000000000 --- a/docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_76_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fc0d406144e3b07ae6bef629cd7b4973424847958b9865e7133bd0f0c7deeb5c -size 534154 diff --git a/docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_9_3.png b/docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_9_3.png deleted file mode 100644 index 5073fa9f3eca09..00000000000000 --- a/docs/notebooks/yolov8-object-detection-with-output_files/yolov8-object-detection-with-output_9_3.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:46d38bd078c6eeda706640d0e35907a9a504bf78f9dd8dbf2961de865c294802 -size 907783 diff --git a/docs/notebooks/yolov9-optimization-with-output.rst b/docs/notebooks/yolov9-optimization-with-output.rst deleted file mode 100644 index 73b6dc0395ce8f..00000000000000 --- a/docs/notebooks/yolov9-optimization-with-output.rst +++ /dev/null @@ -1,952 +0,0 @@ -Convert and Optimize YOLOv9 with OpenVINO™ -========================================== - -YOLOv9 marks a significant advancement in real-time object detection, -introducing groundbreaking techniques such as Programmable Gradient -Information (PGI) and the Generalized Efficient Layer Aggregation -Network (GELAN). 
This model demonstrates remarkable improvements in -efficiency, accuracy, and adaptability, setting new benchmarks on the MS -COCO dataset. More details about model can be found in -`paper `__ and `original -repository `__ This tutorial -demonstrates step-by-step instructions on how to run and optimize -PyTorch YOLO V9 with OpenVINO. - -The tutorial consists of the following steps: - -- Prepare PyTorch model - -- Convert PyTorch model to OpenVINO IR - -- Run model inference with OpenVINO - -- Prepare and run optimization pipeline - -- Compare performance of the FP32 and quantized models. - -- Run optimized model inference on video - -**Table of contents:** - -- `Prerequisites <#prerequisites>`__ - -- `Get PyTorch model <#get-pytorch-model>`__ - -- `Convert PyTorch model to OpenVINO - IR <#convert-pytorch-model-to-openvino-ir>`__ - -- `Verify model inference <#verify-model-inference>`__ - - - `Preprocessing <#preprocessing>`__ - - `Postprocessing <#postprocessing>`__ - - `Select inference device <#select-inference-device>`__ - -- `Optimize model using NNCF Post-training Quantization - API <#optimize-model-using-nncf-post-training-quantization-api>`__ - - - `Prepare dataset <#prepare-dataset>`__ - - `Perform model quantization <#perform-model-quantization>`__ - -- `Run quantized model inference <#run-quantized-model-inference>`__ - -- `Compare Performance of the Original and Quantized - Models <#compare-performance-of-the-original-and-quantized-models>`__ - -- `Run Live Object Detection <#run-live-object-detection>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -Prerequisites -------------- - - - -.. code:: ipython3 - - %pip install -q "openvino>=2023.3.0" "nncf>=2.8.1" "opencv-python" "matplotlib>=3.4" "seaborn" "pandas" "scikit-learn" "torch<2.6.0" "torchvision" "tqdm" --extra-index-url https://download.pytorch.org/whl/cpu - -.. code:: ipython3 - - from pathlib import Path - - # Fetch `notebook_utils` module - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - - open("notebook_utils.py", "w").write(r.text) - - if not Path("cmd_helper.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", - ) - open("cmd_helper.py", "w").write(r.text) - - from cmd_helper import clone_repo - from notebook_utils import download_file, VideoPlayer, device_widget - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("yolov9-optimization.ipynb") - - - clone_repo("https://github.com/WongKinYiu/yolov9", add_to_sys_path=False) - - %cd yolov9 - -Get PyTorch model ------------------ - - - -Generally, PyTorch models represent an instance of the -`torch.nn.Module `__ -class, initialized by a state dictionary with model weights. We will use -the ``gelan-c`` (light-weight version of yolov9) model pre-trained on a -COCO dataset, which is available in this -`repo `__, but the same steps are -applicable for other models from YOLO V9 family. - -.. 
code:: ipython3 - - # Download pre-trained model weights - MODEL_LINK = "https://github.com/WongKinYiu/yolov9/releases/download/v0.1/gelan-c.pt" - DATA_DIR = Path("data/") - MODEL_DIR = Path("model/") - MODEL_DIR.mkdir(exist_ok=True) - DATA_DIR.mkdir(exist_ok=True) - - download_file(MODEL_LINK, directory=MODEL_DIR, show_progress=True) - - -.. parsed-literal:: - - 'model/gelan-c.pt' already exists. - - - - -.. parsed-literal:: - - PosixPath('/home/ea/work/openvino_notebooks/notebooks/yolov9-optimization/yolov9/model/gelan-c.pt') - - - -Convert PyTorch model to OpenVINO IR ------------------------------------- - - - -OpenVINO supports PyTorch model conversion via Model Conversion API. -``ov.convert_model`` function accepts model object and example input for -tracing the model and returns an instance of ``ov.Model``, representing -this model in OpenVINO format. The Obtained model is ready for loading -on specific devices or can be saved on disk for the next deployment -using ``ov.save_model``. - -.. code:: ipython3 - - from models.experimental import attempt_load - import torch - import openvino as ov - from models.yolo import Detect, DualDDetect - from utils.general import yaml_save, yaml_load - - weights = MODEL_DIR / "gelan-c.pt" - ov_model_path = MODEL_DIR / weights.name.replace(".pt", "_openvino_model") / weights.name.replace(".pt", ".xml") - - if not ov_model_path.exists(): - model = attempt_load(weights, device="cpu", inplace=True, fuse=True) - metadata = {"stride": int(max(model.stride)), "names": model.names} - - model.eval() - for k, m in model.named_modules(): - if isinstance(m, (Detect, DualDDetect)): - m.inplace = False - m.dynamic = True - m.export = True - - example_input = torch.zeros((1, 3, 640, 640)) - model(example_input) - - ov_model = ov.convert_model(model, example_input=example_input) - - # specify input and output names for compatibility with yolov9 repo interface - ov_model.outputs[0].get_tensor().set_names({"output0"}) - ov_model.inputs[0].get_tensor().set_names({"images"}) - ov.save_model(ov_model, ov_model_path) - # save metadata - yaml_save(ov_model_path.parent / weights.name.replace(".pt", ".yaml"), metadata) - else: - metadata = yaml_load(ov_model_path.parent / weights.name.replace(".pt", ".yaml")) - -Verify model inference ----------------------- - - - -To test model work, we create inference pipeline similar to -``detect.py``. The pipeline consists of preprocessing step, inference of -OpenVINO model, and results post-processing to get bounding boxes. - -Preprocessing -~~~~~~~~~~~~~ - - - -Model input is a tensor with the ``[1, 3, 640, 640]`` shape in -``N, C, H, W`` format, where - -- ``N`` - number of images in batch (batch size) -- ``C`` - image channels -- ``H`` - image height -- ``W`` - image width - -Model expects images in RGB channels format and normalized in [0, 1] -range. To resize images to fit model size ``letterbox`` resize approach -is used where the aspect ratio of width and height is preserved. It is -defined in yolov9 repository. - -To keep specific shape, preprocessing automatically enables padding. - -.. 
code:: ipython3 - - import numpy as np - import torch - from PIL import Image - from utils.augmentations import letterbox - - image_url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/7b6af406-4ccb-4ded-a13d-62b7c0e42e96" - if not Path("test_image.jpg").exists(): - download_file(image_url, directory=DATA_DIR, filename="test_image.jpg", show_progress=True) - - - def preprocess_image(img0: np.ndarray): - """ - Preprocess image according to YOLOv9 input requirements. - Takes image in np.array format, resizes it to specific size using letterbox resize, converts color space from BGR (default in OpenCV) to RGB and changes data layout from HWC to CHW. - - Parameters: - img0 (np.ndarray): image for preprocessing - Returns: - img (np.ndarray): image after preprocessing - img0 (np.ndarray): original image - """ - # resize - img = letterbox(img0, auto=False)[0] - - # Convert - img = img.transpose(2, 0, 1) - img = np.ascontiguousarray(img) - return img, img0 - - - def prepare_input_tensor(image: np.ndarray): - """ - Converts preprocessed image to tensor format according to YOLOv9 input requirements. - Takes image in np.array format with unit8 data in [0, 255] range and converts it to torch.Tensor object with float data in [0, 1] range - - Parameters: - image (np.ndarray): image for conversion to tensor - Returns: - input_tensor (torch.Tensor): float tensor ready to use for YOLOv9 inference - """ - input_tensor = image.astype(np.float32) # uint8 to fp16/32 - input_tensor /= 255.0 # 0 - 255 to 0.0 - 1.0 - - if input_tensor.ndim == 3: - input_tensor = np.expand_dims(input_tensor, 0) - return input_tensor - - - NAMES = metadata["names"] - - -.. parsed-literal:: - - 'data/test_image.jpg' already exists. - - -Postprocessing -~~~~~~~~~~~~~~ - - - -Model output contains detection boxes candidates. It is a tensor with -the ``[1,25200,85]`` shape in the ``B, N, 85`` format, where: - -- ``B`` - batch size -- ``N`` - number of detection boxes - -Detection box has the [``x``, ``y``, ``h``, ``w``, ``box_score``, -``class_no_1``, …, ``class_no_80``] format, where: - -- (``x``, ``y``) - raw coordinates of box center -- ``h``, ``w`` - raw height and width of box -- ``box_score`` - confidence of detection box -- ``class_no_1``, …, ``class_no_80`` - probability distribution over - the classes. - -For getting final prediction, we need to apply non maximum suppression -algorithm and rescale boxes coordinates to original image size. - -.. code:: ipython3 - - from utils.plots import Annotator, colors - - from typing import List, Tuple - from utils.general import scale_boxes, non_max_suppression - - - def detect( - model: ov.Model, - image_path: Path, - conf_thres: float = 0.25, - iou_thres: float = 0.45, - classes: List[int] = None, - agnostic_nms: bool = False, - ): - """ - OpenVINO YOLOv9 model inference function. Reads image, preprocess it, runs model inference and postprocess results using NMS. - Parameters: - model (Model): OpenVINO compiled model. - image_path (Path): input image path. 
- conf_thres (float, *optional*, 0.25): minimal accepted confidence for object filtering - iou_thres (float, *optional*, 0.45): minimal overlap score for removing objects duplicates in NMS - classes (List[int], *optional*, None): labels for prediction filtering, if not provided all predicted labels will be used - agnostic_nms (bool, *optional*, False): apply class agnostic NMS approach or not - Returns: - pred (List): list of detections with (n,6) shape, where n - number of detected boxes in format [x1, y1, x2, y2, score, label] - orig_img (np.ndarray): image before preprocessing, can be used for results visualization - inpjut_shape (Tuple[int]): shape of model input tensor, can be used for output rescaling - """ - if isinstance(image_path, np.ndarray): - img = image_path - else: - img = np.array(Image.open(image_path)) - preprocessed_img, orig_img = preprocess_image(img) - input_tensor = prepare_input_tensor(preprocessed_img) - predictions = torch.from_numpy(model(input_tensor)[0]) - pred = non_max_suppression(predictions, conf_thres, iou_thres, classes=classes, agnostic=agnostic_nms) - return pred, orig_img, input_tensor.shape - - - def draw_boxes( - predictions: np.ndarray, - input_shape: Tuple[int], - image: np.ndarray, - names: List[str], - ): - """ - Utility function for drawing predicted bounding boxes on image - Parameters: - predictions (np.ndarray): list of detections with (n,6) shape, where n - number of detected boxes in format [x1, y1, x2, y2, score, label] - image (np.ndarray): image for boxes visualization - names (List[str]): list of names for each class in dataset - colors (Dict[str, int]): mapping between class name and drawing color - Returns: - image (np.ndarray): box visualization result - """ - if not len(predictions): - return image - - annotator = Annotator(image, line_width=1, example=str(names)) - # Rescale boxes from input size to original image size - predictions[:, :4] = scale_boxes(input_shape[2:], predictions[:, :4], image.shape).round() - - # Write results - for *xyxy, conf, cls in reversed(predictions): - label = f"{names[int(cls)]} {conf:.2f}" - annotator.box_label(xyxy, label, color=colors(int(cls), True)) - return image - -.. code:: ipython3 - - core = ov.Core() - # read converted model - ov_model = core.read_model(ov_model_path) - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - # load model on selected device - if device.value != "CPU": - ov_model.reshape({0: [1, 3, 640, 640]}) - compiled_model = core.compile_model(ov_model, device.value) - -.. code:: ipython3 - - boxes, image, input_shape = detect(compiled_model, DATA_DIR / "test_image.jpg") - image_with_boxes = draw_boxes(boxes[0], input_shape, image, NAMES) - # visualize results - Image.fromarray(image_with_boxes) - - - - -.. image:: yolov9-optimization-with-output_files/yolov9-optimization-with-output_16_0.png - - - -Optimize model using NNCF Post-training Quantization API --------------------------------------------------------- - - - -`NNCF `__ provides a suite of -advanced algorithms for Neural Networks inference optimization in -OpenVINO with minimal accuracy drop. We will use 8-bit quantization in -post-training mode (without the fine-tuning pipeline) to optimize -YOLOv9. 
The optimization process contains the following steps: - -1. Create a Dataset for quantization. -2. Run ``nncf.quantize`` for getting an optimized model. -3. Serialize an OpenVINO IR model, using the ``ov.save_model`` function. - -Prepare dataset -~~~~~~~~~~~~~~~ - - - -The code below downloads COCO dataset and prepares a dataloader that is -used to evaluate the yolov9 model accuracy. We reuse its subset for -quantization. - -.. code:: ipython3 - - from zipfile import ZipFile - - - DATA_URL = "http://images.cocodataset.org/zips/val2017.zip" - LABELS_URL = "https://github.com/ultralytics/yolov5/releases/download/v1.0/coco2017labels-segments.zip" - - OUT_DIR = Path(".") - - if not (OUT_DIR / "coco/labels").exists(): - download_file(DATA_URL, directory=OUT_DIR, show_progress=True) - download_file(LABELS_URL, directory=OUT_DIR, show_progress=True) - with ZipFile("coco2017labels-segments.zip", "r") as zip_ref: - zip_ref.extractall(OUT_DIR) - with ZipFile("val2017.zip", "r") as zip_ref: - zip_ref.extractall(OUT_DIR / "coco/images") - - -.. parsed-literal:: - - 'val2017.zip' already exists. - 'coco2017labels-segments.zip' already exists. - - -.. code:: ipython3 - - from collections import namedtuple - import yaml - from utils.dataloaders import create_dataloader - from utils.general import colorstr - - # read dataset config - DATA_CONFIG = "data/coco.yaml" - with open(DATA_CONFIG) as f: - data = yaml.load(f, Loader=yaml.SafeLoader) - - # Dataloader - TASK = "val" # path to train/val/test images - Option = namedtuple("Options", ["single_cls"]) # imitation of commandline provided options for single class evaluation - opt = Option(False) - dataloader = create_dataloader( - str(Path("coco") / data[TASK]), - 640, - 1, - 32, - opt, - pad=0.5, - prefix=colorstr(f"{TASK}: "), - )[0] - - -.. parsed-literal:: - - val: Scanning coco/val2017.cache... 4952 images, 48 backgrounds, 0 corrupt: 100%|██████████| 5000/5000 00:00 - - -NNCF provides ``nncf.Dataset`` wrapper for using native framework -dataloaders in quantization pipeline. Additionally, we specify transform -function that will be responsible for preparing input data in model -expected format. - -.. code:: ipython3 - - import nncf - - - def transform_fn(data_item): - """ - Quantization transform function. Extracts and preprocess input data from dataloader item for quantization. - Parameters: - data_item: Tuple with data item produced by DataLoader during iteration - Returns: - input_tensor: Input data for quantization - """ - img = data_item[0].numpy() - input_tensor = prepare_input_tensor(img) - return input_tensor - - - quantization_dataset = nncf.Dataset(dataloader, transform_fn) - - -.. parsed-literal:: - - INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino - - -Perform model quantization -~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - -The ``nncf.quantize`` function provides an interface for model -quantization. It requires an instance of the OpenVINO Model and -quantization dataset. Optionally, some additional parameters for the -configuration quantization process (number of samples for quantization, -preset, ignored scope etc.) can be provided. YOLOv9 model contains -non-ReLU activation functions, which require asymmetric quantization of -activations. To achieve better results, we will use a ``mixed`` -quantization preset. It provides symmetric quantization of weights and -asymmetric quantization of activations. - -.. 
code:: ipython3 - - ov_int8_model_path = MODEL_DIR / weights.name.replace(".pt", "_int8_openvino_model") / weights.name.replace(".pt", "_int8.xml") - - if not ov_int8_model_path.exists(): - quantized_model = nncf.quantize(ov_model, quantization_dataset, preset=nncf.QuantizationPreset.MIXED) - - ov.save_model(quantized_model, ov_int8_model_path) - yaml_save(ov_int8_model_path.parent / weights.name.replace(".pt", "_int8.yaml"), metadata) - -Run quantized model inference ------------------------------ - - - -There are no changes in model usage after applying quantization. Let’s -check the model work on the previously used image. - -.. code:: ipython3 - - quantized_model = core.read_model(ov_int8_model_path) - - if device.value != "CPU": - quantized_model.reshape({0: [1, 3, 640, 640]}) - - compiled_model = core.compile_model(quantized_model, device.value) - -.. code:: ipython3 - - boxes, image, input_shape = detect(compiled_model, DATA_DIR / "test_image.jpg") - image_with_boxes = draw_boxes(boxes[0], input_shape, image, NAMES) - # visualize results - Image.fromarray(image_with_boxes) - - - - -.. image:: yolov9-optimization-with-output_files/yolov9-optimization-with-output_27_0.png - - - -Compare Performance of the Original and Quantized Models --------------------------------------------------------- - - - -We use the OpenVINO `Benchmark -Tool `__ -to measure the inference performance of the ``FP32`` and ``INT8`` -models. - - **NOTE**: For more accurate performance, it is recommended to run - ``benchmark_app`` in a terminal/command prompt after closing other - applications. Run ``benchmark_app -m model.xml -d CPU`` to benchmark - async inference on CPU for one minute. Change ``CPU`` to ``GPU`` to - benchmark on GPU. Run ``benchmark_app --help`` to see an overview of - all command-line options. - -.. code:: ipython3 - - !benchmark_app -m $ov_model_path -shape "[1,3,640,640]" -d $device.value -api async -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.0.0-14509-34caeefd078-releases/2024/0 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.0.0-14509-34caeefd078-releases/2024/0 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. - [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 34.88 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] images (node: x) : f32 / [...] / [?,3,?,?] - [ INFO ] Model outputs: - [ INFO ] output0 (node: __module.model.22/aten::cat/Concat_5) : f32 / [...] / [?,84,8400] - [ INFO ] xi.1 (node: __module.model.22/aten::cat/Concat_2) : f32 / [...] / [?,144,4..,4..] - [ INFO ] xi.3 (node: __module.model.22/aten::cat/Concat_1) : f32 / [...] / [?,144,2..,2..] - [ INFO ] xi (node: __module.model.22/aten::cat/Concat) : f32 / [...] / [?,144,1..,1..] 
- [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'images': [1,3,640,640] - [ INFO ] Reshape model took 10.57 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] images (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] output0 (node: __module.model.22/aten::cat/Concat_5) : f32 / [...] / [1,84,8400] - [ INFO ] xi.1 (node: __module.model.22/aten::cat/Concat_2) : f32 / [...] / [1,144,80,80] - [ INFO ] xi.3 (node: __module.model.22/aten::cat/Concat_1) : f32 / [...] / [1,144,40,40] - [ INFO ] xi (node: __module.model.22/aten::cat/Concat) : f32 / [...] / [1,144,20,20] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 810.84 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 0 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'images'!. This input will be filled with random values! - [ INFO ] Fill input 'images' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 267.81 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 276 iterations - [ INFO ] Duration: 15758.17 ms - [ INFO ] Latency: - [ INFO ] Median: 675.43 ms - [ INFO ] Average: 677.33 ms - [ INFO ] Min: 437.98 ms - [ INFO ] Max: 813.27 ms - [ INFO ] Throughput: 17.51 FPS - - -.. code:: ipython3 - - !benchmark_app -m $ov_int8_model_path -shape "[1,3,640,640]" -d $device.value -api async -t 15 - - -.. parsed-literal:: - - [Step 1/11] Parsing and validating input arguments - [ INFO ] Parsing input parameters - [Step 2/11] Loading OpenVINO Runtime - [ INFO ] OpenVINO: - [ INFO ] Build ................................. 2024.0.0-14509-34caeefd078-releases/2024/0 - [ INFO ] - [ INFO ] Device info: - [ INFO ] AUTO - [ INFO ] Build ................................. 2024.0.0-14509-34caeefd078-releases/2024/0 - [ INFO ] - [ INFO ] - [Step 3/11] Setting device configuration - [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. 
- [Step 4/11] Reading model files - [ INFO ] Loading model files - [ INFO ] Read model took 74.87 ms - [ INFO ] Original model I/O parameters: - [ INFO ] Model inputs: - [ INFO ] images (node: x) : f32 / [...] / [?,3,?,?] - [ INFO ] Model outputs: - [ INFO ] output0 (node: __module.model.22/aten::cat/Concat_5) : f32 / [...] / [?,84,8400] - [ INFO ] xi.1 (node: __module.model.22/aten::cat/Concat_2) : f32 / [...] / [?,144,4..,4..] - [ INFO ] xi.3 (node: __module.model.22/aten::cat/Concat_1) : f32 / [...] / [?,144,2..,2..] - [ INFO ] xi (node: __module.model.22/aten::cat/Concat) : f32 / [...] / [?,144,1..,1..] - [Step 5/11] Resizing model to match image sizes and given batch - [ INFO ] Model batch size: 1 - [ INFO ] Reshaping model: 'images': [1,3,640,640] - [ INFO ] Reshape model took 21.00 ms - [Step 6/11] Configuring input of the model - [ INFO ] Model inputs: - [ INFO ] images (node: x) : u8 / [N,C,H,W] / [1,3,640,640] - [ INFO ] Model outputs: - [ INFO ] output0 (node: __module.model.22/aten::cat/Concat_5) : f32 / [...] / [1,84,8400] - [ INFO ] xi.1 (node: __module.model.22/aten::cat/Concat_2) : f32 / [...] / [1,144,80,80] - [ INFO ] xi.3 (node: __module.model.22/aten::cat/Concat_1) : f32 / [...] / [1,144,40,40] - [ INFO ] xi (node: __module.model.22/aten::cat/Concat) : f32 / [...] / [1,144,20,20] - [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 1577.10 ms - [Step 8/11] Querying optimal runtime parameters - [ INFO ] Model: - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] PERFORMANCE_HINT: PerformanceMode.THROUGHPUT - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] MULTI_DEVICE_PRIORITIES: CPU - [ INFO ] CPU: - [ INFO ] AFFINITY: Affinity.CORE - [ INFO ] CPU_DENORMALS_OPTIMIZATION: False - [ INFO ] CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0 - [ INFO ] DYNAMIC_QUANTIZATION_GROUP_SIZE: 0 - [ INFO ] ENABLE_CPU_PINNING: True - [ INFO ] ENABLE_HYPER_THREADING: True - [ INFO ] EXECUTION_DEVICES: ['CPU'] - [ INFO ] EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE - [ INFO ] INFERENCE_NUM_THREADS: 36 - [ INFO ] INFERENCE_PRECISION_HINT: - [ INFO ] KV_CACHE_PRECISION: - [ INFO ] LOG_LEVEL: Level.NO - [ INFO ] NETWORK_NAME: Model0 - [ INFO ] NUM_STREAMS: 12 - [ INFO ] OPTIMAL_NUMBER_OF_INFER_REQUESTS: 12 - [ INFO ] PERFORMANCE_HINT: THROUGHPUT - [ INFO ] PERFORMANCE_HINT_NUM_REQUESTS: 0 - [ INFO ] PERF_COUNT: NO - [ INFO ] SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE - [ INFO ] MODEL_PRIORITY: Priority.MEDIUM - [ INFO ] LOADED_FROM_CACHE: False - [Step 9/11] Creating infer requests and preparing input tensors - [ WARNING ] No input files were given for input 'images'!. This input will be filled with random values! - [ INFO ] Fill input 'images' with random values - [Step 10/11] Measuring performance (Start inference asynchronously, 12 inference requests, limits: 15000 ms duration) - [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 97.75 ms - [Step 11/11] Dumping statistics report - [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 948 iterations - [ INFO ] Duration: 15275.99 ms - [ INFO ] Latency: - [ INFO ] Median: 189.64 ms - [ INFO ] Average: 192.41 ms - [ INFO ] Min: 116.33 ms - [ INFO ] Max: 302.97 ms - [ INFO ] Throughput: 62.06 FPS - - -Run Live Object Detection -------------------------- - - - -.. 
code:: ipython3 - - import collections - import time - from IPython import display - import cv2 - - - # Main processing function to run object detection. - def run_object_detection( - source=0, - flip=False, - use_popup=False, - skip_first_frames=0, - model=ov_model, - device=device.value, - ): - player = None - compiled_model = core.compile_model(model, device) - try: - # Create a video player to play with target fps. - player = VideoPlayer(source=source, flip=flip, fps=30, skip_first_frames=skip_first_frames) - # Start capturing. - player.start() - if use_popup: - title = "Press ESC to Exit" - cv2.namedWindow(winname=title, flags=cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE) - - processing_times = collections.deque() - while True: - # Grab the frame. - frame = player.next() - if frame is None: - print("Source ended") - break - # If the frame is larger than full HD, reduce size to improve the performance. - scale = 1280 / max(frame.shape) - if scale < 1: - frame = cv2.resize( - src=frame, - dsize=None, - fx=scale, - fy=scale, - interpolation=cv2.INTER_AREA, - ) - # Get the results. - input_image = np.array(frame) - - start_time = time.time() - # model expects RGB image, while video capturing in BGR - detections, _, input_shape = detect(compiled_model, input_image[:, :, ::-1]) - stop_time = time.time() - - image_with_boxes = draw_boxes(detections[0], input_shape, input_image, NAMES) - frame = image_with_boxes - - processing_times.append(stop_time - start_time) - # Use processing times from last 200 frames. - if len(processing_times) > 200: - processing_times.popleft() - - _, f_width = frame.shape[:2] - # Mean processing time [ms]. - processing_time = np.mean(processing_times) * 1000 - fps = 1000 / processing_time - cv2.putText( - img=frame, - text=f"Inference time: {processing_time:.1f}ms ({fps:.1f} FPS)", - org=(20, 40), - fontFace=cv2.FONT_HERSHEY_COMPLEX, - fontScale=f_width / 1000, - color=(0, 0, 255), - thickness=1, - lineType=cv2.LINE_AA, - ) - # Use this workaround if there is flickering. - if use_popup: - cv2.imshow(winname=title, mat=frame) - key = cv2.waitKey(1) - # escape = 27 - if key == 27: - break - else: - # Encode numpy array to jpg. - _, encoded_img = cv2.imencode(ext=".jpg", img=frame, params=[cv2.IMWRITE_JPEG_QUALITY, 100]) - # Create an IPython image. - i = display.Image(data=encoded_img) - # Display the image in this notebook. - display.clear_output(wait=True) - display.display(i) - # ctrl-c - except KeyboardInterrupt: - print("Interrupted") - # any different error - except RuntimeError as e: - print(e) - finally: - if player is not None: - # Stop capturing. - player.stop() - if use_popup: - cv2.destroyAllWindows() - -Use a webcam as the video input. By default, the primary webcam is set -with ``source=0``. If you have multiple webcams, each one will be -assigned a consecutive number starting at 0. Set ``flip=True`` when -using a front-facing camera. Some web browsers, especially Mozilla -Firefox, may cause flickering. If you experience flickering, -set ``use_popup=True``. - - **NOTE**: To use this notebook with a webcam, you need to run the - notebook on a computer with a webcam. If you run the notebook on a - remote server (for example, in Binder or Google Colab service), the - webcam will not work. By default, the lower cell will run model - inference on a video file. If you want to try live inference on your - webcam set ``WEBCAM_INFERENCE = True`` - -Run the object detection: - -..
code:: ipython3 - - WEBCAM_INFERENCE = False - - if WEBCAM_INFERENCE: - VIDEO_SOURCE = 0 # Webcam - else: - VIDEO_URL = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/people.mp4" - VIDEO_SOURCE = Path("people.mp4") - if not VIDEO_SOURCE.exists(): - download_file(VIDEO_URL) - -.. code:: ipython3 - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - quantized_model = core.read_model(ov_int8_model_path) - - run_object_detection( - source=VIDEO_SOURCE, - flip=True, - use_popup=False, - model=quantized_model, - device=device.value, - ) diff --git a/docs/notebooks/yolov9-optimization-with-output_files/yolov9-optimization-with-output_16_0.jpg b/docs/notebooks/yolov9-optimization-with-output_files/yolov9-optimization-with-output_16_0.jpg deleted file mode 100644 index b4c8b93f73a5cc..00000000000000 --- a/docs/notebooks/yolov9-optimization-with-output_files/yolov9-optimization-with-output_16_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:054998978d57b209ef21838dbf9ff431aac04bde1dd1fa69c721a0a22e028774 -size 52845 diff --git a/docs/notebooks/yolov9-optimization-with-output_files/yolov9-optimization-with-output_16_0.png b/docs/notebooks/yolov9-optimization-with-output_files/yolov9-optimization-with-output_16_0.png deleted file mode 100644 index f9fc498584b483..00000000000000 --- a/docs/notebooks/yolov9-optimization-with-output_files/yolov9-optimization-with-output_16_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:40d5c7c1bfc0a3695451e87efe8c19b13cf66036efb226479466044c42ede189 -size 530382 diff --git a/docs/notebooks/yolov9-optimization-with-output_files/yolov9-optimization-with-output_27_0.jpg b/docs/notebooks/yolov9-optimization-with-output_files/yolov9-optimization-with-output_27_0.jpg deleted file mode 100644 index b6b9f9ab78a982..00000000000000 --- a/docs/notebooks/yolov9-optimization-with-output_files/yolov9-optimization-with-output_27_0.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c65dd96351a2b4bfe332adf10f82538f78eacd7b02ed96d95d3a7d50b96391e0 -size 53039 diff --git a/docs/notebooks/yolov9-optimization-with-output_files/yolov9-optimization-with-output_27_0.png b/docs/notebooks/yolov9-optimization-with-output_files/yolov9-optimization-with-output_27_0.png deleted file mode 100644 index 4d61a9b41eabf7..00000000000000 --- a/docs/notebooks/yolov9-optimization-with-output_files/yolov9-optimization-with-output_27_0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3aa917b35074c2ea1fc680168ed3be650edc692b570d62466429860ae97e67a0 -size 531122 diff --git a/docs/notebooks/zeroscope-text2video-with-output.rst b/docs/notebooks/zeroscope-text2video-with-output.rst deleted file mode 100644 index d3dfc27db5852e..00000000000000 --- a/docs/notebooks/zeroscope-text2video-with-output.rst +++ /dev/null @@ -1,888 +0,0 @@ -Video generation with ZeroScope and OpenVINO -============================================ - - -**Table of contents:** - - -- `Install and import required - packages <#install-and-import-required-packages>`__ -- `Load the model <#load-the-model>`__ -- `Convert the model <#convert-the-model>`__ - - - `Define the conversion - function <#define-the-conversion-function>`__ - - `UNet <#unet>`__ - - `VAE <#vae>`__ - - `Text encoder <#text-encoder>`__ - -- `Build a pipeline 
<#build-a-pipeline>`__ -- `Inference with OpenVINO <#inference-with-openvino>`__ - - - `Select inference device <#select-inference-device>`__ - - `Define a prompt <#define-a-prompt>`__ - - `Video generation <#video-generation>`__ - -- `Interactive demo <#interactive-demo>`__ - -Installation Instructions -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is a self-contained example that relies solely on its own code. - -We recommend running the notebook in a virtual environment. You only -need a Jupyter server to start. For details, please refer to -`Installation -Guide `__. - -The ZeroScope model is a free and open-source text-to-video model that -can generate realistic and engaging videos from text descriptions. It is -based on the -`Modelscope `__ -model, but it has been improved to produce higher-quality videos with a -16:9 aspect ratio and no Shutterstock watermark. The ZeroScope model is -available in two versions: ZeroScope_v2 576w, which is optimized for -rapid content creation at a resolution of 576x320 pixels, and -ZeroScope_v2 XL, which upscales videos to a high-definition resolution -of 1024x576. - -The ZeroScope model is trained on a dataset of over 9,000 videos and -29,000 tagged frames. It uses a diffusion model to generate videos, -which means that it starts with a random noise image and gradually adds -detail to it until it matches the text description. The ZeroScope model -is still under development, but it has already been used to create some -impressive videos. For example, it has been used to create videos of -people dancing, playing sports, and even driving cars. - -The ZeroScope model is a powerful tool that can be used to create -various videos, from simple animations to complex scenes. It is still -under development, but it has the potential to revolutionize the way we -create and consume video content. - -Both versions of the ZeroScope model are available on Hugging Face: - -- `ZeroScope_v2 576w `__ -- `ZeroScope_v2 XL `__ - -We will use the first one. - -.. warning:: - - :: - - This tutorial requires at least 24GB of free memory to generate a video with a frame size of 432x240 and 16 frames. Increasing either of these values will require more memory and take more time. - -Install and import required packages ------------------------------------- - - - -To work with text-to-video synthesis model, we will use Hugging Face’s -`Diffusers `__ library. It -provides already pretrained model from ``cerspense``. - -.. code:: ipython3 - - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "diffusers>=0.18.0" "torch>=2.1" transformers "openvino>=2023.1.0" numpy "gradio>=4.19" - -.. 
code:: ipython3 - - import gc - from typing import Optional, Union, List, Callable - import base64 - import tempfile - import warnings - from pathlib import Path - - import diffusers - import transformers - import numpy as np - import IPython - import torch - import PIL - import gradio as gr - - import openvino as ov - - import requests - - if not Path("notebook_utils.py").exists(): - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) - - # Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry - from notebook_utils import collect_telemetry - - collect_telemetry("zeroscope-text2video.ipynb") - -Original 576x320 inference requires a lot of RAM (>100GB), so let’s run -our example on a smaller frame size, keeping the same aspect ratio. Try -reducing values below to reduce the memory consumption. - -.. code:: ipython3 - - WIDTH = 432 # must be divisible by 8 - HEIGHT = 240 # must be divisible by 8 - NUM_FRAMES = 16 - -Load the model --------------- - - - -The model is loaded from HuggingFace using ``.from_pretrained`` method -of ``diffusers.DiffusionPipeline``. - -.. code:: ipython3 - - pipe = diffusers.DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w") - - -.. parsed-literal:: - - unet/diffusion_pytorch_model.safetensors not found - - - -.. parsed-literal:: - - Loading pipeline components...: 0%| | 0/5 [00:00 Path: - xml_path = Path(xml_path) - if not xml_path.exists(): - xml_path.parent.mkdir(parents=True, exist_ok=True) - with torch.no_grad(): - converted_model = ov.convert_model(model, **convert_kwargs) - ov.save_model(converted_model, xml_path) - del converted_model - gc.collect() - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - return xml_path - -UNet -~~~~ - - - -Text-to-video generation pipeline main component is a conditional 3D -UNet model that takes a noisy sample, conditional state, and a timestep -and returns a sample shaped output. - -.. code:: ipython3 - - unet_xml_path = convert( - unet, - "models/unet.xml", - example_input={ - "sample": torch.randn(2, 4, 2, int(sample_height // 2), int(sample_width // 2)), - "timestep": torch.tensor(1), - "encoder_hidden_states": torch.randn(2, 77, 1024), - }, - input=[ - ("sample", (2, 4, NUM_FRAMES, sample_height, sample_width)), - ("timestep", ()), - ("encoder_hidden_states", (2, 77, 1024)), - ], - ) - del unet - gc.collect(); - -VAE -~~~ - - - -Variational autoencoder (VAE) uses UNet output to decode latents to -visual representations. Our VAE model has KL loss for encoding images -into latents and decoding latent representations into images. For -inference, we need only decoder part. - -.. code:: ipython3 - - class VaeDecoderWrapper(torch.nn.Module): - def __init__(self, vae): - super().__init__() - self.vae = vae - - def forward(self, z: torch.FloatTensor): - return self.vae.decode(z) - -.. code:: ipython3 - - vae_decoder_xml_path = convert( - VaeDecoderWrapper(vae), - "models/vae.xml", - example_input=torch.randn(2, 4, 32, 32), - input=((NUM_FRAMES, 4, sample_height, sample_width)), - ) - del vae - gc.collect(); - -Text encoder -~~~~~~~~~~~~ - - - -Text encoder is used to encode the input prompt to tensor. Default -tensor length is 77. - -.. 
code:: ipython3 - - text_encoder_xml = convert( - text_encoder, - "models/text_encoder.xml", - example_input=torch.ones(1, 77, dtype=torch.int64), - input=((1, 77), ov.Type.i64), - ) - del text_encoder - gc.collect(); - -Build a pipeline ----------------- - - - -.. code:: ipython3 - - def tensor2vid(video: torch.Tensor, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) -> List[np.ndarray]: - # This code is copied from https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 - # reshape to ncfhw - mean = torch.tensor(mean, device=video.device).reshape(1, -1, 1, 1, 1) - std = torch.tensor(std, device=video.device).reshape(1, -1, 1, 1, 1) - # unnormalize back to [0,1] - video = video.mul_(std).add_(mean) - video.clamp_(0, 1) - # prepare the final outputs - i, c, f, h, w = video.shape - images = video.permute(2, 3, 0, 4, 1).reshape(f, h, i * w, c) # 1st (frames, h, batch_size, w, c) 2nd (frames, h, batch_size * w, c) - images = images.unbind(dim=0) # prepare a list of indvidual (consecutive frames) - images = [(image.cpu().numpy() * 255).astype("uint8") for image in images] # f h w c - return images - -.. code:: ipython3 - - try: - from diffusers.utils import randn_tensor - except ImportError: - from diffusers.utils.torch_utils import randn_tensor - - - class OVTextToVideoSDPipeline(diffusers.DiffusionPipeline): - def __init__( - self, - vae_decoder: ov.CompiledModel, - text_encoder: ov.CompiledModel, - tokenizer: transformers.CLIPTokenizer, - unet: ov.CompiledModel, - scheduler: diffusers.schedulers.DDIMScheduler, - ): - super().__init__() - - self.vae_decoder = vae_decoder - self.text_encoder = text_encoder - self.tokenizer = tokenizer - self.unet = unet - self.scheduler = scheduler - self.vae_scale_factor = vae_scale_factor - self.unet_in_channels = unet_in_channels - self.width = WIDTH - self.height = HEIGHT - self.num_frames = NUM_FRAMES - - def __call__( - self, - prompt: Union[str, List[str]] = None, - num_inference_steps: int = 50, - guidance_scale: float = 9.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "np", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: int = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the video generation. If not defined, one has to pass `prompt_embeds`. - instead. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality videos at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate videos that are closely linked to the text `prompt`, - usually at the expense of lower video quality. 
- negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the video generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. Latents should be of shape - `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"np"`): - The output format of the generate video. Choose between `torch.FloatTensor` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.TextToVideoSDPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - `List[np.ndarray]`: generated video frames - """ - - num_images_per_prompt = 1 - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. 
Prepare latent variables - num_channels_latents = self.unet_in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = {"generator": generator, "eta": eta} - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - { - "sample": latent_model_input, - "timestep": t, - "encoder_hidden_states": prompt_embeds, - } - )[0] - noise_pred = torch.tensor(noise_pred) - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # reshape latents - bsz, channel, frames, width, height = latents.shape - latents = latents.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) - noise_pred = noise_pred.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # reshape latents back - latents = latents[None, :].reshape(bsz, frames, channel, width, height).permute(0, 2, 1, 3, 4) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - video_tensor = self.decode_latents(latents) - - if output_type == "pt": - video = video_tensor - else: - video = tensor2vid(video_tensor) - - if not return_dict: - return (video,) - - return {"frames": video} - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): - removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) - print( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - prompt_embeds = self.text_encoder(text_input_ids) - prompt_embeds = prompt_embeds[0] - prompt_embeds = torch.tensor(prompt_embeds) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) - prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError(f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" f" {type(prompt)}.") - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - - negative_prompt_embeds = self.text_encoder(uncond_input.input_ids) - negative_prompt_embeds = negative_prompt_embeds[0] - negative_prompt_embeds = torch.tensor(negative_prompt_embeds) - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - - # For classifier free guidance, we need to do two forward passes. 
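# Rather than running the UNet twice, the two sets of embeddings are batched together below;
# the denoising loop later splits the prediction again with noise_pred.chunk(2) and combines the halves as
# noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond).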
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - def prepare_latents( - self, - batch_size, - num_channels_latents, - dtype, - generator, - latents=None, - ): - shape = ( - batch_size, - num_channels_latents, - self.num_frames, - self.height // self.vae_scale_factor, - self.width // self.vae_scale_factor, - ) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - def check_inputs( - self, - prompt, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if self.height % 8 != 0 or self.width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {self.height} and {self.width}.") - - if (callback_steps is None) or (callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)): - raise ValueError(f"`callback_steps` has to be a positive integer but is {callback_steps} of type" f" {type(callback_steps)}.") - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError("Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.") - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - def decode_latents(self, latents): - scale_factor = 0.18215 - latents = 1 / scale_factor * latents - - batch_size, channels, num_frames, height, width = latents.shape - latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) - image = self.vae_decoder(latents)[0] - image = torch.tensor(image) - video = ( - image[None, :] - .reshape( - ( - batch_size, - num_frames, - -1, - ) - + image.shape[2:] - ) - .permute(0, 2, 1, 3, 4) - ) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - video = video.float() - return video - -Inference with OpenVINO ------------------------ - - - -.. 
code:: ipython3 - - core = ov.Core() - -Select inference device -~~~~~~~~~~~~~~~~~~~~~~~ - - - -select device from dropdown list for running inference using OpenVINO - -.. code:: ipython3 - - from notebook_utils import device_widget - - device = device_widget() - - device - - - - -.. parsed-literal:: - - Dropdown(description='Device:', index=2, options=('CPU', 'GNA', 'AUTO'), value='AUTO') - - - -.. code:: ipython3 - - %%time - ov_unet = core.compile_model(unet_xml_path, device_name=device.value) - - -.. parsed-literal:: - - CPU times: user 10.9 s, sys: 4.63 s, total: 15.5 s - Wall time: 8.67 s - - -.. code:: ipython3 - - %%time - ov_vae_decoder = core.compile_model(vae_decoder_xml_path, device_name=device.value) - - -.. parsed-literal:: - - CPU times: user 432 ms, sys: 251 ms, total: 683 ms - Wall time: 337 ms - - -.. code:: ipython3 - - %%time - ov_text_encoder = core.compile_model(text_encoder_xml, device_name=device.value) - - -.. parsed-literal:: - - CPU times: user 1.23 s, sys: 1.19 s, total: 2.43 s - Wall time: 1.11 s - - -Here we replace the pipeline parts with versions converted to OpenVINO -IR and compiled to specific device. Note that we use original pipeline -tokenizer and scheduler. - -.. code:: ipython3 - - ov_pipe = OVTextToVideoSDPipeline(ov_vae_decoder, ov_text_encoder, tokenizer, ov_unet, scheduler) - -Define a prompt -~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - prompt = "A panda eating bamboo on a rock." - -Let’s generate a video for our prompt. For full list of arguments, see -``__call__`` function definition of ``OVTextToVideoSDPipeline`` class in -`Build a pipeline <#build-a-pipeline>`__ section. - -Video generation -~~~~~~~~~~~~~~~~ - - - -.. code:: ipython3 - - frames = ov_pipe(prompt, num_inference_steps=25)["frames"] - - - -.. parsed-literal:: - - 0%| | 0/25 [00:00') - - - - -.. raw:: html - - - - - -Interactive demo ----------------- - - - -.. code:: ipython3 - - def generate(prompt, seed, num_inference_steps, _=gr.Progress(track_tqdm=True)): - generator = torch.Generator().manual_seed(seed) - frames = ov_pipe( - prompt, - num_inference_steps=num_inference_steps, - generator=generator, - )["frames"] - out_file = tempfile.NamedTemporaryFile(suffix=".gif", delete=False) - images = [PIL.Image.fromarray(frame) for frame in frames] - images[0].save(out_file, save_all=True, append_images=images[1:], duration=125, loop=0) - return out_file.name - -.. code:: ipython3 - - if not Path("gradio_helper.py").exists(): - r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/zeroscope-text2video/gradio_helper.py") - open("gradio_helper.py", "w").write(r.text) - - from gradio_helper import make_demo - - demo = make_demo(fn=generate) - - try: - demo.queue().launch(debug=False) - except Exception: - demo.queue().launch(share=True, debug=False) - # If you are launching remotely, specify server_name and server_port - # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` - # To learn more please refer to the Gradio docs: https://gradio.app/docs/ - -.. 
code:: ipython3 - - # please uncomment and run this cell for stopping gradio interface - # demo.close() diff --git a/samples/js/node/notebooks/vision-background-removal.nnb b/samples/js/node/notebooks/vision-background-removal.nnb index 48cebf48770356..27a777b36669ba 100644 --- a/samples/js/node/notebooks/vision-background-removal.nnb +++ b/samples/js/node/notebooks/vision-background-removal.nnb @@ -38,7 +38,7 @@ { "language": "typescript", "source": [ - "// Details about this normalization:\n// https://docs.openvino.ai/2025/notebooks/vision-background-removal-with-output.html#load-and-pre-process-input-image\nfunction normalizeImage(imageData, width, height) {\n // Mean and scale values\n const inputMean = [123.675, 116.28, 103.53];\n const inputScale = [58.395, 57.12, 57.375];\n\n const normalizedData = new Float32Array(imageData.length);\n const channels = 3;\n\n for (let i = 0; i < height; i++) {\n for (let j = 0; j < width; j++) {\n for (let c = 0; c < channels; c++) {\n const index = i * width * channels + j * channels + c;\n\n normalizedData[index] =\n (imageData[index] - inputMean[c]) / inputScale[c];\n }\n }\n }\n\n return normalizedData;\n}" + "// Details about this normalization:\n// https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/vision-background-removal/vision-background-removal.ipynb#Load-and-Pre-Process-Input-Image\nfunction normalizeImage(imageData, width, height) {\n // Mean and scale values\n const inputMean = [123.675, 116.28, 103.53];\n const inputScale = [58.395, 57.12, 57.375];\n\n const normalizedData = new Float32Array(imageData.length);\n const channels = 3;\n\n for (let i = 0; i < height; i++) {\n for (let j = 0; j < width; j++) {\n for (let c = 0; c < channels; c++) {\n const index = i * width * channels + j * channels + c;\n\n normalizedData[index] =\n (imageData[index] - inputMean[c]) / inputScale[c];\n }\n }\n }\n\n return normalizedData;\n}" ], "outputs": [] }, diff --git a/samples/js/node/vision_background_removal/vision_background_removal.js b/samples/js/node/vision_background_removal/vision_background_removal.js index e8687b8abd3bd8..02a7e4616b6ab9 100644 --- a/samples/js/node/vision_background_removal/vision_background_removal.js +++ b/samples/js/node/vision_background_removal/vision_background_removal.js @@ -112,7 +112,7 @@ async function main( } // Details about this normalization: -// https://docs.openvino.ai/2025/notebooks/vision-background-removal-with-output.html#load-and-pre-process-input-image +// https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/vision-background-removal/vision-background-removal.ipynb#Load-and-Pre-Process-Input-Image function normalizeImage(imageData, width, height) { // Mean and scale values const inputMean = [123.675, 116.28, 103.53];
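// Note: these values are the standard ImageNet channel statistics scaled to the 0-255 range
// (mean 0.485/0.456/0.406 and std 0.229/0.224/0.225, each multiplied by 255),
// matching the normalization used in the notebook cell above.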