diff --git a/CHANGELOG.md b/CHANGELOG.md index dade5a4a1..57e6a4234 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,14 @@ ### Fixes and improvements +## [v4.3.0](https://github.com/OpenNMT/CTranslate2/releases/tag/v4.3.0) (2024-05-17) + +### New features +* Support phi-3 (8k and 128k) (#1700 and #1680) + +### Fixes and improvements +* Fix Flash Attention regression (#1695) + ## [v4.2.1](https://github.com/OpenNMT/CTranslate2/releases/tag/v4.2.1) (2024-04-24) Note: Because of the increasing of package's size (> 100 MB), the release v4.2.0 was pushed unsuccessfully. diff --git a/python/ctranslate2/version.py b/python/ctranslate2/version.py index 904d573c1..283149802 100644 --- a/python/ctranslate2/version.py +++ b/python/ctranslate2/version.py @@ -1,3 +1,3 @@ """Version information.""" -__version__ = "4.2.1" +__version__ = "4.3.0" diff --git a/src/devices.cc b/src/devices.cc index 47582f8be..a2936e0a6 100644 --- a/src/devices.cc +++ b/src/devices.cc @@ -196,7 +196,7 @@ namespace ctranslate2 { for (auto* comm : _nccl_comms) { //finalizing NCCL if (*comm) { - NCCL_CHECK(ncclCommAbort(*comm)); + NCCL_CHECK(ncclCommFinalize(*comm)); NCCL_CHECK(ncclCommDestroy(*comm)); } } diff --git a/src/layers/attention.cc b/src/layers/attention.cc index f340c44f9..18e2710f7 100644 --- a/src/layers/attention.cc +++ b/src/layers/attention.cc @@ -1,5 +1,7 @@ #include "ctranslate2/layers/attention.h" #include "ctranslate2/ops/split.h" +#include "ctranslate2/utils.h" + #include #include @@ -210,11 +212,20 @@ namespace ctranslate2 { is_decoder, with_cache ? 
key_length - 1 : 0); } + StorageView* position_bias_per_gpu = position_bias; + StorageView position_bias_tmp(position_bias->dtype(), position_bias->device()); + if (ScopedMPISetter::getCurRank() != 0) { + const dim_t num_head_per_gpu = SAFE_DIVIDE(position_bias->dim(0), ScopedMPISetter::getNRanks()); + ops::Slide slide_ops(0, num_head_per_gpu * ScopedMPISetter::getCurRank(), + num_head_per_gpu, true); + slide_ops(*position_bias, position_bias_tmp); + position_bias_per_gpu = &position_bias_tmp; + } DEVICE_AND_TYPE_DISPATCH(output.device(), output.dtype(), - primitives::add_batch_broadcast(position_bias->data(), + primitives::add_batch_broadcast(position_bias_per_gpu->data(), output.data(), - position_bias->size(), + position_bias_per_gpu->size(), output.size())); }