Skip to content

Commit

Permalink
Merged PR 34540: pymarian: build for multiple python versions; disabl…
Browse files Browse the repository at this point in the history
…e tcmalloc; huggingface backed for gated COMETs

pymarian upgrades
* Support for build for multiple python versions at once;  borrowed a cmake script from AMD
* use "build" instead of "pip wheel"; build is more stable and leaves less junk on file system
* Disable tcmalloc for pymarian
* Added support for [huggingface backend](https://huggingface.co/collections/Unbabel/marian-comet-metrics-and-qe-664e28c82743db6709d022fc). Currently enabled for gated comet models only.
* Added `--cache` argument to pymarian-eval CLI; Useful for accessing cache from blobstorage mount path for gated models
  • Loading branch information
Thamme Gowda authored and Roman Grundkiewicz committed Jun 27, 2024
1 parent 07042cf commit a6ab8af
Show file tree
Hide file tree
Showing 11 changed files with 296 additions and 60 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

## [Unreleased]
- Fixed compilation with clang 16.0.6
- Added Threads::Threads to EXT_LIBS

- Added Threads::Threads to `EXT_LIBS`
- Updates to pymarian: building for multiple python versions; disabling tcmalloc; hosting gated COMETs on HuggingFace

### Added
- Added `--normalize-gradient-by-ratio` to mildly adapt gradient magnitude if effective batch size diverges from running average effective batch size.
Expand Down
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,12 @@ set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
set(THREADS_PREFER_PTHREAD_FLAG TRUE)
find_package(Threads REQUIRED)
set(EXT_LIBS ${EXT_LIBS} Threads::Threads)

# Disable tcmalloc when pymarian is being built.
# Rationale (see the warning text below): TCMalloc can cause segfaults with some
# python libraries, so a pymarian build runs without it.
# Note: set() without CACHE assigns a normal variable, so it takes effect for
# the code that follows in this scope; a cache entry of the same name, if one
# exists, is not rewritten.
if(USE_TCMALLOC AND PYMARIAN)
message(WARNING "TCMalloc can cause segfaults with some python libraries. Hence disabling TCMalloc for a robust pymarian build.")
set(USE_TCMALLOC off)
endif()
########

###############################################################################
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
v1.12.30
v1.12.31
119 changes: 119 additions & 0 deletions cmake/PythonModules.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# Retrieved from ROCm/AMDMIGraphX repo @ https://github.com/ROCm/AMDMIGraphX/blob/develop/cmake/PythonModules.cmake
#####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
# Include guard: find_python() is defined further down in this file, so if the
# command already exists the file has (most likely) been processed before.
# Stop here to avoid re-defining the macros/functions and re-creating the
# imported python<version>::* targets.
if(COMMAND find_python)
return()
endif()


# py_exec(<execute_process arguments>...)
#
# Wrapper around execute_process() that aborts configuration with a
# FATAL_ERROR when the process does not finish with status 0.
# All arguments are forwarded unchanged, so callers pass COMMAND,
# OUTPUT_VARIABLE, etc. exactly as they would to execute_process().
#
# This has to remain a macro: the variable named by the caller's
# OUTPUT_VARIABLE must be set in the caller's scope.
macro(py_exec)
  # A macro shares its caller's variable scope, so the status is stored under
  # a py_exec-specific name instead of a generic one that could clobber an
  # unrelated variable of the caller.
  execute_process(${ARGN} RESULT_VARIABLE _py_exec_result)
  if(NOT _py_exec_result EQUAL 0)
    message(FATAL_ERROR "Process failed: ${ARGN} (status: ${_py_exec_result})")
  endif()
endmacro()

# NOTE(review): PYBIND11_NOPYTHON presumably tells pybind11 to skip its own
# Python discovery, since this module locates the interpreters itself --
# confirm against the pybind11 CMake documentation.
set(PYBIND11_NOPYTHON On)
# find_package(pybind11 REQUIRED) is deliberately not called here: it does not
# work when pybind11 is consumed as a git submodule.

## =====================
# PYTHON_DISABLE_VERSIONS: user-settable list of versions to exclude from the
# search (empty by default).
set(PYTHON_DISABLE_VERSIONS "" CACHE STRING "")

# Candidate Python versions, minus every version the user disabled.
set(PYTHON_SEARCH_VERSIONS
  3.7
  3.8
  3.9
  3.10
  3.11
  3.12
  3.13
)
foreach(PYTHON_DISABLE_VERSION IN LISTS PYTHON_DISABLE_VERSIONS)
  list(REMOVE_ITEM PYTHON_SEARCH_VERSIONS "${PYTHON_DISABLE_VERSION}")
endforeach()

## =====================

# find_python(<version>)
#
# Looks for the `python<version>-config` program. When it is found, this
# defines:
#   python<version>::headers      INTERFACE IMPORTED GLOBAL target whose include
#                                 directories come from `--includes`
#   python<version>::runtime      INTERFACE IMPORTED GLOBAL target whose link
#                                 options come from `--ldflags`; it also links
#                                 python<version>::headers
#   PYTHON_<version>_EXECUTABLE   cache entry set to
#                                 <prefix>/bin/python<version>, where <prefix>
#                                 is the output of `--prefix` (the path is not
#                                 checked for existence)
# When the program is not found nothing is defined; callers detect success
# with `if(TARGET python<version>::headers)`.
#
# Because this is a macro, the `_python_*` temporaries remain set in the
# caller's scope after it returns.
macro(find_python version)
  find_program(PYTHON_CONFIG_${version} python${version}-config)
  if(EXISTS "${PYTHON_CONFIG_${version}}")
    py_exec(COMMAND "${PYTHON_CONFIG_${version}}" --includes
            OUTPUT_VARIABLE _python_include_args)

    # Try `--ldflags --embed` first and fall back to plain `--ldflags` when
    # that exits non-zero (python-config scripts that do not know --embed).
    # The probe is expected to fail on such scripts, so its stderr is
    # silenced to keep the configure log clean.
    execute_process(
      COMMAND "${PYTHON_CONFIG_${version}}" --ldflags --embed
      OUTPUT_VARIABLE _python_ldflags_args
      RESULT_VARIABLE _python_ldflags_result
      ERROR_QUIET
    )
    if(NOT _python_ldflags_result EQUAL 0)
      py_exec(COMMAND "${PYTHON_CONFIG_${version}}" --ldflags
              OUTPUT_VARIABLE _python_ldflags_args)
    endif()

    separate_arguments(_python_includes UNIX_COMMAND "${_python_include_args}")
    separate_arguments(_python_ldflags UNIX_COMMAND "${_python_ldflags_args}")
    # Remove only the leading "-I" of each entry; a "-I" that happens to occur
    # inside a directory name must be left alone.
    list(TRANSFORM _python_includes REPLACE "^-I" "")

    add_library(python${version}::headers INTERFACE IMPORTED GLOBAL)
    set_target_properties(python${version}::headers PROPERTIES
      INTERFACE_INCLUDE_DIRECTORIES "${_python_includes}"
    )

    add_library(python${version}::runtime INTERFACE IMPORTED GLOBAL)
    set_target_properties(python${version}::runtime PROPERTIES
      INTERFACE_LINK_OPTIONS "${_python_ldflags}"
      INTERFACE_LINK_LIBRARIES python${version}::headers
    )

    py_exec(COMMAND "${PYTHON_CONFIG_${version}}" --prefix
            OUTPUT_VARIABLE _python_prefix)
    string(STRIP "${_python_prefix}" _python_prefix)
    # FILEPATH (not PATH): the entry names an executable file, not a directory.
    set(PYTHON_${version}_EXECUTABLE "${_python_prefix}/bin/python${version}"
        CACHE FILEPATH "Python ${version} interpreter derived from python${version}-config --prefix")
  endif()
endmacro()

#######
# py_extension(<target> <python-version>)
#
# Gives <target> the file name shape of a Python extension module: no "lib"
# prefix, and a suffix of ".so" for versions below 3.0 or, for 3.0 and later,
# whatever `python<version>-config --extension-suffix` prints.
function(py_extension name version)
  if(version VERSION_LESS 3.0)
    set(_ext_suffix ".so")
  else()
    # Relies on PYTHON_CONFIG_<version>, which find_python() sets.
    py_exec(
      COMMAND ${PYTHON_CONFIG_${version}} --extension-suffix
      OUTPUT_VARIABLE _ext_suffix
    )
    # Drop the trailing newline of the command output.
    string(STRIP "${_ext_suffix}" _ext_suffix)
  endif()

  set_target_properties(${name} PROPERTIES
    PREFIX ""
    SUFFIX "${_ext_suffix}"
  )
endfunction()

# py_add_module(<target> <sources>...
#               PYTHON_VERSION <version>
#               [PYTHON_MODULE <module-name>])
#
# Creates MODULE library <target> from <sources> as a pybind11 extension for
# one specific Python version.
#   PYTHON_VERSION  (required) version whose python<version>::headers target
#                   (created by find_python) is linked in.
#   PYTHON_MODULE   output file base name of the module; defaults to <target>
#                   when omitted.
# Every argument that is not one of the keywords above is treated as a source
# file.
function(py_add_module NAME)
  set(options)
  set(oneValueArgs PYTHON_VERSION PYTHON_MODULE)
  set(multiValueArgs)

  cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  # Fail early with a clear message; otherwise the code below would refer to
  # the non-existent target "python::headers".
  if(NOT DEFINED PARSE_PYTHON_VERSION)
    message(FATAL_ERROR "py_add_module(${NAME}): PYTHON_VERSION <version> is required")
  endif()
  set(PYTHON_VERSION ${PARSE_PYTHON_VERSION})

  # Without a value, OUTPUT_NAME below would swallow the next property keyword
  # and set_target_properties() would fail with a misleading error.
  if(DEFINED PARSE_PYTHON_MODULE)
    set(_py_module_name "${PARSE_PYTHON_MODULE}")
  else()
    set(_py_module_name "${NAME}")
  endif()

  add_library(${NAME} MODULE ${PARSE_UNPARSED_ARGUMENTS})
  pybind11_strip(${NAME})
  py_extension(${NAME} ${PYTHON_VERSION})
  target_link_libraries(${NAME}
    PRIVATE
      pybind11::module
      pybind11::lto
      python${PYTHON_VERSION}::headers
  )
  set_target_properties(${NAME} PROPERTIES
    OUTPUT_NAME "${_py_module_name}"
    C_VISIBILITY_PRESET hidden
    CXX_VISIBILITY_PRESET hidden
  )
endfunction()

###
# Probe every candidate version. A version counts as found when find_python()
# created its python<version>::headers target.
set(_PYTHON_VERSIONS)
foreach(PYTHON_VERSION ${PYTHON_SEARCH_VERSIONS})
  find_python(${PYTHON_VERSION})
  if(TARGET python${PYTHON_VERSION}::headers)
    message(STATUS "Python ${PYTHON_VERSION} found.")
    list(APPEND _PYTHON_VERSIONS ${PYTHON_VERSION})
  else()
    message(STATUS "Python ${PYTHON_VERSION} not found.")
  endif()
endforeach()

# Do not let an empty result pass silently: code that iterates over
# PYTHON_VERSIONS would otherwise do nothing without any diagnostic.
if("${_PYTHON_VERSIONS}" STREQUAL "")
  message(WARNING
    "No python<version>-config program was found for any of: "
    "${PYTHON_SEARCH_VERSIONS}. PYTHON_VERSIONS will be empty.")
endif()

# Make the variable global: publish the result as an internal cache entry so it
# is visible outside the scope that included this file.
set(PYTHON_VERSIONS "${_PYTHON_VERSIONS}" CACHE INTERNAL "" FORCE)

45 changes: 28 additions & 17 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -292,24 +292,35 @@ endif(GENERATE_MARIAN_INSTALL_TARGETS)


if(PYMARIAN)
if(NOT PYTHON_EXECUTABLE)
set(PYTHON_EXECUTABLE python) # default to python in the environment
endif()

# python libs which use different version of tcmalloc (e.g. pandas) can cause segfaults, so we disable it
include_directories(3rd_party/pybind11/include)
add_subdirectory(3rd_party/pybind11)
include(PythonModules)
# print all python versions
message(STATUS "Going to look for these Python versions: ${PYTHON_VERSIONS}")
add_custom_target(_pymarian)
foreach(PYTHON_VERSION ${PYTHON_VERSIONS}) # defined in PythonModules.cmake
py_add_module(_pymarian_${PYTHON_VERSION} python/binding/bind.cpp PYTHON_VERSION ${PYTHON_VERSION} PYTHON_MODULE _pymarian)
target_link_libraries(_pymarian_${PYTHON_VERSION} PUBLIC marian ${EXT_LIBS})
if(CUDA_FOUND)
target_link_libraries(_pymarian_${PYTHON_VERSION} PUBLIC marian_cuda)
endif(CUDA_FOUND)
add_dependencies(_pymarian _pymarian_${PYTHON_VERSION})
endforeach()

# ==== make .whl files ====
# IMPORTANT: do not parallelize the wheel builds; they conflict on a few directories (e.g. *.egg-info)
set(LAST_PYMARIAN_TGT "")
foreach(PYTHON_VERSION ${PYTHON_VERSIONS})
add_custom_target(
pymarian_${PYTHON_VERSION} ALL
COMMAND ${PYTHON_${PYTHON_VERSION}_EXECUTABLE} -m pip install --upgrade pip build
COMMAND ${CMAKE_COMMAND} -E env "CMAKE_BINARY_DIR=${PROJECT_BINARY_DIR}" "CMAKE_SOURCE_DIR=${PROJECT_SOURCE_DIR}"
${PYTHON_${PYTHON_VERSION}_EXECUTABLE} -m build --wheel ${PROJECT_SOURCE_DIR}/src/python -o "${PROJECT_BINARY_DIR}"
DEPENDS _pymarian_${PYTHON_VERSION} ${LAST_PYMARIAN_TGT}
VERBATIM COMMENT "===Building pymarian wheel for python${PYTHON_VERSION}==="
)
set(LAST_PYMARIAN_TGT pymarian_${PYTHON_VERSION})
endforeach(PYTHON_VERSION)

pybind11_add_module(_pymarian MODULE python/binding/bind.cpp)
target_link_libraries(_pymarian PUBLIC marian)
if(CUDA_FOUND)
target_link_libraries(_pymarian PUBLIC marian_cuda)
endif(CUDA_FOUND)
install(TARGETS _pymarian DESTINATION .)

# build pymarian wheel
add_custom_target(pymarian ALL
${CMAKE_COMMAND} -E env "CMAKE_BINARY_DIR=${PROJECT_BINARY_DIR}" "CMAKE_SOURCE_DIR=${PROJECT_SOURCE_DIR}"
"${PYTHON_EXECUTABLE}" -m pip wheel -v --no-input ${PROJECT_SOURCE_DIR}/src/python -w "${PROJECT_BINARY_DIR}"
DEPENDS _pymarian
VERBATIM COMMENT "Building pymarian wheel")
endif(PYMARIAN)
68 changes: 67 additions & 1 deletion src/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ cmake --build build -j # -j option parallelizes build on all cpu cores
python -m pip install build/pymarian-*.whl
```

Since the above commands uses `python` executable in the PATH to determine Python version to compile marian native extension, make sure to have the desired `python` executable in your environment _before_ invoking these commands.
The above commands use the `python` executable in the PATH to determine the Python version for compiling the marian native extension. Make sure to have the desired `python` executable in your environment _before_ invoking these cmake commands.

## Python API

Expand Down Expand Up @@ -96,6 +96,7 @@ options:
-ws WORKSPACE, --workspace WORKSPACE
Workspace memory (default: 8000)
-pc, --print-cmd Print marian evaluate command and exit (default: False)
--cache CACHE Cache directory for storing models (default: $HOME/.cache/marian/metric)
More info at https://github.com/marian-nmt/marian-dev. This CLI is loaded from .../python3.10/site-packages/pymarian/eval.py (version: 1.12.25)
Expand Down Expand Up @@ -157,6 +158,71 @@ python -m pytest -s src/python/tests/regression
```
## Release Instructions
### Building Pymarian for Multiple Python Versions
Our CMake scripts detect the `python3.*` versions available in PATH (by looking for the matching `python3.x-config` programs) and build pymarian for each.
To support a specific version of python, make the `python3.x` executable available in PATH prior to running cmake.
This can be achieved without conflicts by using `conda` or `mamba`.
```bash
# setup mamba if not already; Note: you may use conda as well
which mamba || {
name=Miniforge3-$(uname)-$(uname -m).sh
wget "https://github.com/conda-forge/miniforge/releases/latest/download/$name" \
&& bash $name -b -p ~/mambaforge && ~/mambaforge/bin/mamba init bash && rm $name
}
# create environment for each version
versions="$(echo 3.{12,11,10,9,8,7})"
for version in $versions; do
echo "python $version"
mamba env list | grep -q "^py${version}" || mamba create -q -y -n py${version} python=${version}
done
# stack all environments
for version in $versions; do mamba activate py${version} --stack; done
# check if all python versions are available
for version in $versions; do which python$version; done
# Build as usual
cmake . -B build -DCOMPILE_CUDA=off -DPYMARIAN=on
cmake --build build -j
ls build/pymarian*.whl
```
### Upload to PyPI
```bash
twine upload -r testpypi build/*.whl
twine upload -r pypi build/*.whl
```
__Initial Setup:__ create `~/.pypirc` with the following:
```ini
[distutils]
index-servers =
pypi
testpypi
[pypi]
repository: https://upload.pypi.org/legacy/
username:__token__
password:<token>
[testpypi]
repository: https://test.pypi.org/legacy/
username:__token__
password:<token>
```
Obtain token from https://pypi.org/manage/account/
## Known issues
1. In conda or mamba environment, if you see `.../miniconda3/envs/<envname>/bin/../lib/libstdc++.so.6: version 'GLIBCXX_3.4.30' not found` error,
Expand Down
37 changes: 22 additions & 15 deletions src/python/pymarian/defaults.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from pathlib import Path

import os

class Defaults:
BASE_URL = "https://textmt.blob.core.windows.net/www/marian/metric"
CACHE_PATH = Path.home() / '.cache' / 'marian' / 'metric'

DEF_CACHE_PATH = Path.home() / '.cache' / 'marian' / 'metric'
# user might also change this from CLI at runtime
CACHE_PATH = Path(os.environ['MARIAN_CACHE']) if os.environ.get('MARIAN_CACHE', '').strip() else DEF_CACHE_PATH
MINI_BATCH = 16
MAXI_BATCH = 256
WORKSPACE = 8000
Expand All @@ -12,20 +15,24 @@ class Defaults:
FLOAT_PRECISION = 4
FILE_LOCK_TIMEOUT = 1 * 60 * 60 # seconds => 1 hour
PROGRESS_BAR = True

# metric name to model type; lowercase all IDs
HUGGINGFACE = "huggingface"
AZURE = "azure"
COMET_VOCAB_REPO = "microsoft/infoxlm-large"
# metric id -> (model_type, huggingface_org/model_id)
# unbabel agreed to host models within their org and added the same gating/licensing mechanism
# we hosted bleurt ourselves (Apache2.0) on https://huggingface.co/marian-nmt
KNOWN_METRICS = {
"bleurt-20": "bleurt",
"wmt20-comet-da": "comet",
"wmt20-comet-qe-da": "comet-qe",
"wmt20-comet-qe-da-v2": "comet-qe",
"wmt21-comet-da": "comet",
"wmt21-comet-qe-da": "comet-qe",
"wmt21-comet-qe-mqm": "comet-qe",
"wmt22-comet-da": "comet",
"wmt22-cometkiwi-da": "comet-qe",
"xcomet-xl": "comet",
"xcomet-xxL": "comet",
"bleurt-20": ["bleurt", "marian-nmt/bleurt-20"],
"wmt20-comet-da": ["comet", "unbabel/wmt20-comet-da-marian"],
"wmt20-comet-qe-da": ["comet-qe", "unbabel/wmt20-comet-qe-da-marian"],
"wmt20-comet-qe-da-v2": ["comet-qe", "unbabel/wmt20-comet-qe-da-v2-marian"],
"wmt21-comet-da": ["comet", "unbabel/wmt21-comet-da-marian"],
"wmt21-comet-qe-da": ["comet-qe", "unbabel/wmt21-comet-qe-da-marian"],
"wmt21-comet-qe-mqm": ["comet-qe", "unbabel/wmt21-comet-qe-mqm-marian"],
"wmt22-comet-da": ["comet", "unbabel/wmt22-comet-da-marian"],
"wmt22-cometkiwi-da": ["comet-qe", "unbabel/wmt22-cometkiwi-da-marian"],
"wmt23-cometkiwi-da-xl": ["comet-qe", "unbabel/wmt23-cometkiwi-da-xl-marian"],
"wmt23-cometkiwi-da-xxl": ["comet-qe", "unbabel/wmt23-cometkiwi-da-xxl-marian"],
}

# model type to field order
Expand Down
6 changes: 4 additions & 2 deletions src/python/pymarian/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def parse_args():
f'This CLI is loaded from {__file__} (version: {__version__})',
)

known_metrics = ', '.join(Defaults.KNOWN_METRICS)
known_metrics = ', '.join(Defaults.KNOWN_METRICS.keys())
parser.add_argument(
'-m',
'--model',
Expand Down Expand Up @@ -89,6 +89,7 @@ def parse_args():
parser.add_argument(
'-pc', '--print-cmd', action="store_true", help="Print marian evaluate command and exit"
)
parser.add_argument('--cache', help='Cache directory for storing models', type=Path, default=Defaults.CACHE_PATH)

args = parser.parse_args()
return vars(args)
Expand Down Expand Up @@ -197,6 +198,7 @@ def main(**args):
log.debug(args)
else:
args['quiet'] = ''
Defaults.CACHE_PATH = args.pop('cache')

model_id = args.pop('model')
model_path = Path(model_id)
Expand All @@ -221,7 +223,7 @@ def main(**args):
model_path = get_model_path(model_id)
if not vocab_path: # if vocab is not given, resolve it from cache
vocab_path = get_vocab_path(model_id)
args['like'] = Defaults.KNOWN_METRICS.get(model_id, Defaults.DEF_MODEL_TYPE)
args['like'] = Defaults.KNOWN_METRICS.get(model_id, [Defaults.DEF_MODEL_TYPE])[0]
except ValueError as e:
raise ValueError(f'Invalid model ID: {model_id}') from e

Expand Down
Loading

0 comments on commit a6ab8af

Please sign in to comment.