diff --git a/CHANGELOG.md b/CHANGELOG.md index dade5a4a1..57e6a4234 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,14 @@ ### Fixes and improvements +## [v4.3.0](https://github.com/OpenNMT/CTranslate2/releases/tag/v4.3.0) (2024-05-17) + +### New features +* Support phi-3 (8k and 128k) (#1700 and #1680) + +### Fixes and improvements +* Fix Flash Attention regression (#1695) + ## [v4.2.1](https://github.com/OpenNMT/CTranslate2/releases/tag/v4.2.1) (2024-04-24) Note: Because of the increasing of package's size (> 100 MB), the release v4.2.0 was pushed unsuccessfully. diff --git a/python/ctranslate2/version.py b/python/ctranslate2/version.py index 904d573c1..283149802 100644 --- a/python/ctranslate2/version.py +++ b/python/ctranslate2/version.py @@ -1,3 +1,3 @@ """Version information.""" -__version__ = "4.2.1" +__version__ = "4.3.0" diff --git a/src/devices.cc b/src/devices.cc index 47582f8be..a2936e0a6 100644 --- a/src/devices.cc +++ b/src/devices.cc @@ -196,7 +196,7 @@ namespace ctranslate2 { for (auto* comm : _nccl_comms) { //finalizing NCCL if (*comm) { - NCCL_CHECK(ncclCommAbort(*comm)); + NCCL_CHECK(ncclCommFinalize(*comm)); NCCL_CHECK(ncclCommDestroy(*comm)); } } diff --git a/src/layers/attention.cc b/src/layers/attention.cc index f340c44f9..18e2710f7 100644 --- a/src/layers/attention.cc +++ b/src/layers/attention.cc @@ -1,5 +1,7 @@ #include "ctranslate2/layers/attention.h" #include "ctranslate2/ops/split.h" +#include "ctranslate2/utils.h" + #include #include @@ -210,11 +212,20 @@ namespace ctranslate2 { is_decoder, with_cache ? 
key_length - 1 : 0); } + StorageView* position_bias_per_gpu = position_bias; + StorageView position_bias_tmp(position_bias->dtype(), position_bias->device()); + if (ScopedMPISetter::getCurRank() != 0) { + const dim_t num_head_per_gpu = SAFE_DIVIDE(position_bias->dim(0), ScopedMPISetter::getNRanks()); + ops::Slide slide_ops(0, num_head_per_gpu * ScopedMPISetter::getCurRank(), + num_head_per_gpu, true); + slide_ops(*position_bias, position_bias_tmp); + position_bias_per_gpu = &position_bias_tmp; + } DEVICE_AND_TYPE_DISPATCH(output.device(), output.dtype(), - primitives::add_batch_broadcast(position_bias->data(), + primitives::add_batch_broadcast(position_bias_per_gpu->data(), output.data(), - position_bias->size(), + position_bias_per_gpu->size(), output.size())); }