Merge branch 'OpenNMT:master' into master

OpenNMT · Sep 10, 2024 · 14e4c4c · 14e4c4c
2 parents 3d76464 + 8f4d134
commit 14e4c4c
Show file tree

Hide file tree

Showing 24 changed files with 309 additions and 16 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,23 @@
 
 ### Fixes and improvements
 
+## [v4.4.0](https://github.com/OpenNMT/CTranslate2/releases/tag/v4.4.0) (2024-09-09)
+**Removed**: Flash Attention support in the Python package due to significant package size increase with minimal performance gain.  
+Note: Flash Attention remains supported in the C++ package with the `WITH_FLASH_ATTN` option.  
+Flash Attention may be re-added in the future if substantial improvements are made.
+
+### New features
+* Support Llama3 (#1751)
+* Support Gemma2 (#1772)
+* Add log probs for all tokens in vocab (#1755)
+* Grouped conv1d (#1749 + #1758)
+
+### Fixes and improvements
+* Fix pipeline (#1723 + #1747)
+* Some improvements in flash attention (#1732)
+* Fix crash when using return_alternative on CUDA (#1733)
+* Quantization AWQ GEMM + GEMV (#1727)
+
 ## [v4.3.1](https://github.com/OpenNMT/CTranslate2/releases/tag/v4.3.1) (2024-06-10)
 Note: Because of exceeding project's size on Pypi (> 20 GB), the release v4.3.0 was pushed unsuccessfully.
 

diff --git a/include/ctranslate2/decoding.h b/include/ctranslate2/decoding.h
@@ -15,14 +15,16 @@ namespace ctranslate2 {
     std::vector<std::vector<size_t>> hypotheses;
     std::vector<float> scores;
     std::vector<std::vector<std::vector<float>>> attention;
+    std::vector<std::vector<StorageView>> logits_vocab;
   };
 
   struct DecodingStepResult {
     size_t step;
     size_t batch_id;
     size_t token_id;
     size_t hypothesis_id;
-    std::optional<float> log_prob;
+    std::optional<float> score;
+    std::optional<StorageView> logits;
     bool is_last = false;
   };
 
@@ -41,6 +43,7 @@ namespace ctranslate2 {
            const dim_t min_length,
            const bool return_scores = false,
            const bool return_attention = false,
+           const bool return_logits_vocab = true,
            const bool return_prefix = true,
            const size_t num_hypotheses = 1,
            const bool include_eos_in_hypotheses = true,
@@ -67,6 +70,7 @@ namespace ctranslate2 {
            const dim_t min_length,
            const bool return_scores = false,
            const bool return_attention = false,
+           const bool return_logits_vocab = true,
            const bool return_prefix = true,
            const size_t num_hypotheses = 1,
            const bool include_eos_in_hypotheses = true,
@@ -118,6 +122,7 @@ namespace ctranslate2 {
            const dim_t min_length,
            const bool return_scores = false,
            const bool return_attention = false,
+           const bool return_logits_vocab = true,
            const bool return_prefix = true,
            const size_t num_hypotheses = 1,
            const bool include_eos_in_hypotheses = true,
@@ -149,6 +154,7 @@ namespace ctranslate2 {
     bool include_eos_in_hypotheses = true;
     bool return_scores = false;
     bool return_attention = false;
+    bool return_logits_vocab = false;
     bool return_alternatives = false;
     bool return_prefix = true;
     float min_alternative_expansion_prob = 0;

diff --git a/include/ctranslate2/generation.h b/include/ctranslate2/generation.h
@@ -53,6 +53,8 @@ namespace ctranslate2 {
 
     // Include scores in the result.
     bool return_scores = false;
+    // Include the log probs of all vocabulary tokens in the result.
+    bool return_logits_vocab = false;
 
     // Return alternatives at the first unconstrained decoding position. This is typically
     // used with a prefix to provide alternatives at a specifc location.
@@ -79,6 +81,7 @@ namespace ctranslate2 {
     std::vector<std::vector<std::string>> sequences;
     std::vector<std::vector<size_t>> sequences_ids;
     std::vector<float> scores;
+    std::vector<std::vector<StorageView>> logits;
 
     size_t num_sequences() const {
       return sequences.size();
@@ -95,7 +98,8 @@ namespace ctranslate2 {
     size_t token_id;
     size_t hypothesis_id;
     std::string token;
-    std::optional<float> log_prob;
+    std::optional<float> score;
+    std::optional<StorageView> logits;
     bool is_last;
 
     GenerationStepResult() = default;
@@ -105,7 +109,8 @@ namespace ctranslate2 {
       , token_id(result.token_id)
       , hypothesis_id(result.hypothesis_id)
       , token(vocabulary.to_token(result.token_id))
-      , log_prob(result.log_prob)
+      , score(result.score)
+      , logits(result.logits)
       , is_last(result.is_last)
     {
     }

diff --git a/include/ctranslate2/layers/transformer.h b/include/ctranslate2/layers/transformer.h
@@ -119,6 +119,8 @@ namespace ctranslate2 {
       const std::unique_ptr<const LayerNorm> _shared_layer_norm;
       const std::unique_ptr<const LayerNorm> _input_layer_norm;
       const std::unique_ptr<const LayerNorm> _post_attention_layer_norm;
+      const std::unique_ptr<const LayerNorm> _pre_feedforward_layer_norm;
+      const std::unique_ptr<const LayerNorm> _post_feedforward_layer_norm;
       const std::unique_ptr<const AttentionLayer> _encoder_attention;
       const FeedForwardNetwork _ff;
     };

diff --git a/include/ctranslate2/models/whisper.h b/include/ctranslate2/models/whisper.h
@@ -41,6 +41,9 @@ namespace ctranslate2 {
       // Include scores in the result.
       bool return_scores = false;
 
+      // Include the log probs of all vocabulary tokens in the result.
+      bool return_logits_vocab = false;
+
       // Include the probability of the no speech token in the result.
       bool return_no_speech_prob = false;
 
@@ -59,6 +62,7 @@ namespace ctranslate2 {
       std::vector<std::vector<std::string>> sequences;
       std::vector<std::vector<size_t>> sequences_ids;
       std::vector<float> scores;
+      std::vector<std::vector<StorageView>> logits;
       float no_speech_prob = 0;
 
       size_t num_sequences() const {

diff --git a/include/ctranslate2/translation.h b/include/ctranslate2/translation.h
@@ -67,6 +67,8 @@ namespace ctranslate2 {
     bool return_scores = false;
     // Store attention vectors in the TranslationResult class.
     bool return_attention = false;
+    // Store log probs matrix in the TranslationResult class.
+    bool return_logits_vocab = false;
 
     // Return alternatives at the first unconstrained decoding position. This is typically
     // used with a target prefix to provide alternatives at a specifc location in the
@@ -87,6 +89,7 @@ namespace ctranslate2 {
     std::vector<std::vector<std::string>> hypotheses;
     std::vector<float> scores;
     std::vector<std::vector<std::vector<float>>> attention;
+    std::vector<std::vector<StorageView>> logits;
 
     TranslationResult(std::vector<std::vector<std::string>> hypotheses_)
       : hypotheses(std::move(hypotheses_))
@@ -95,10 +98,12 @@ namespace ctranslate2 {
 
     TranslationResult(std::vector<std::vector<std::string>> hypotheses_,
                       std::vector<float> scores_,
-                      std::vector<std::vector<std::vector<float>>> attention_)
+                      std::vector<std::vector<std::vector<float>>> attention_,
+                      std::vector<std::vector<StorageView>> logits_)
       : hypotheses(std::move(hypotheses_))
       , scores(std::move(scores_))
       , attention(std::move(attention_))
+      , logits(std::move(logits_))
     {
     }
 

diff --git a/python/cpp/generation_result.cc b/python/cpp/generation_result.cc
@@ -21,8 +21,10 @@ namespace ctranslate2 {
                       "Index of the hypothesis in the batch.")
         .def_readonly("token", &GenerationStepResult::token,
                       "String value of the generated token.")
-        .def_readonly("log_prob", &GenerationStepResult::log_prob,
+        .def_readonly("log_prob", &GenerationStepResult::score,
                       "Log probability of the token (``None`` if :obj:`return_log_prob` was disabled).")
+        .def_readonly("logits", &GenerationStepResult::logits,
+                      "Log probability on the vocab of all tokens.")
         .def_readonly("is_last", &GenerationStepResult::is_last,
                       "Whether this step is the last decoding step for this batch.")
 
@@ -32,7 +34,8 @@ namespace ctranslate2 {
             + ", token_id=" + std::string(py::repr(py::cast(result.token_id)))
             + ", hypothesis_id=" + std::string(py::repr(py::cast(result.hypothesis_id)))
             + ", token=" + std::string(py::repr(py::cast(result.token)))
-            + ", log_prob=" + std::string(py::repr(py::cast(result.log_prob)))
+            + ", log_prob=" + std::string(py::repr(py::cast(result.score)))
+            + ", logits=" + std::string(py::repr(py::cast(result.logits)))
             + ", is_last=" + std::string(py::repr(py::cast(result.is_last)))
             + ")";
         })
@@ -46,11 +49,14 @@ namespace ctranslate2 {
                       "Generated sequences of token IDs.")
         .def_readonly("scores", &GenerationResult::scores,
                       "Score of each sequence (empty if :obj:`return_scores` was disabled).")
+        .def_readonly("logits", &GenerationResult::logits,
+                      "Score of each sequence (empty if :obj:`return_logits_vocab` was disabled).")
 
         .def("__repr__", [](const GenerationResult& result) {
           return "GenerationResult(sequences=" + std::string(py::repr(py::cast(result.sequences)))
             + ", sequences_ids=" + std::string(py::repr(py::cast(result.sequences_ids)))
             + ", scores=" + std::string(py::repr(py::cast(result.scores)))
+            + ", logits=" + std::string(py::repr(py::cast(result.logits)))
             + ")";
         })
         ;

diff --git a/python/cpp/generator.cc b/python/cpp/generator.cc
@@ -33,6 +33,7 @@ namespace ctranslate2 {
                      bool cache_static_prompt,
                      bool include_prompt_in_result,
                      bool return_scores,
+                     bool return_logits_vocab,
                      bool return_alternatives,
                      float min_alternative_expansion_prob,
                      size_t sampling_topk,
@@ -58,6 +59,7 @@ namespace ctranslate2 {
         options.num_hypotheses = num_hypotheses;
         options.return_end_token = return_end_token;
         options.return_scores = return_scores;
+        options.return_logits_vocab = return_logits_vocab;
         options.return_alternatives = return_alternatives;
         options.cache_static_prompt = cache_static_prompt;
         options.include_prompt_in_result = include_prompt_in_result;
@@ -203,6 +205,7 @@ namespace ctranslate2 {
              py::arg("cache_static_prompt")=true,
              py::arg("include_prompt_in_result")=true,
              py::arg("return_scores")=false,
+             py::arg("return_logits_vocab")=false,
              py::arg("return_alternatives")=false,
              py::arg("min_alternative_expansion_prob")=0,
              py::arg("sampling_topk")=1,
@@ -260,6 +263,7 @@ namespace ctranslate2 {
                      reuse it for future generations using the same static prompt.
                    include_prompt_in_result: Include the :obj:`start_tokens` in the result.
                    return_scores: Include the scores in the output.
+                   return_logits_vocab: Include log probs for each token in the output
                    return_alternatives: Return alternatives at the first unconstrained decoding position.
                    min_alternative_expansion_prob: Minimum initial probability to expand an alternative.
                    sampling_topk: Randomly sample predictions from the top K candidates.

diff --git a/python/cpp/storage_view.cc b/python/cpp/storage_view.cc
@@ -192,6 +192,12 @@ namespace ctranslate2 {
           return stream.str();
         })
 
+        .def("__repr__", [](const StorageView& view) {
+          std::ostringstream stream;
+          stream << view;
+          return stream.str();
+        })
+
         .def("to",
              [](const StorageView& view, DataType dtype) {
                ScopedDeviceSetter device_setter(view.device(), view.device_index());

diff --git a/python/cpp/translation_result.cc b/python/cpp/translation_result.cc
@@ -16,11 +16,14 @@ namespace ctranslate2 {
                       "Score of each translation hypothesis (empty if :obj:`return_scores` was disabled).")
         .def_readonly("attention", &TranslationResult::attention,
                       "Attention matrix of each translation hypothesis (empty if :obj:`return_attention` was disabled).")
+        .def_readonly("logits", &TranslationResult::logits,
+                      "Score of each translation hypothesis (empty if :obj:`return_logits_vocab` was disabled).")
 
         .def("__repr__", [](const TranslationResult& result) {
           return "TranslationResult(hypotheses=" + std::string(py::repr(py::cast(result.hypotheses)))
             + ", scores=" + std::string(py::repr(py::cast(result.scores)))
             + ", attention=" + std::string(py::repr(py::cast(result.attention)))
+            + ", logits=" + std::string(py::repr(py::cast(result.logits)))
             + ")";
         })
 

diff --git a/python/cpp/translator.cc b/python/cpp/translator.cc
@@ -141,6 +141,7 @@ namespace ctranslate2 {
                       size_t min_decoding_length,
                       bool use_vmap,
                       bool return_scores,
+                      bool return_logits_vocab,
                       bool return_attention,
                       bool return_alternatives,
                       float min_alternative_expansion_prob,
@@ -172,6 +173,7 @@ namespace ctranslate2 {
         options.use_vmap = use_vmap;
         options.return_end_token = return_end_token;
         options.return_scores = return_scores;
+        options.return_logits_vocab = return_logits_vocab;
         options.return_attention = return_attention;
         options.return_alternatives = return_alternatives;
         options.min_alternative_expansion_prob = min_alternative_expansion_prob;
@@ -354,6 +356,7 @@ namespace ctranslate2 {
              py::arg("min_decoding_length")=1,
              py::arg("use_vmap")=false,
              py::arg("return_scores")=false,
+             py::arg("return_logits_vocab")=false,
              py::arg("return_attention")=false,
              py::arg("return_alternatives")=false,
              py::arg("min_alternative_expansion_prob")=0,
@@ -396,6 +399,7 @@ namespace ctranslate2 {
                    min_decoding_length: Minimum prediction length.
                    use_vmap: Use the vocabulary mapping file saved in this model
                    return_scores: Include the scores in the output.
+                   return_logits_vocab: Include the log probs of each token in the output
                    return_attention: Include the attention vectors in the output.
                    return_alternatives: Return alternatives at the first unconstrained decoding position.
                    min_alternative_expansion_prob: Minimum initial probability to expand an alternative.

diff --git a/python/cpp/whisper.cc b/python/cpp/whisper.cc
@@ -40,6 +40,7 @@ namespace ctranslate2 {
                size_t no_repeat_ngram_size,
                size_t max_length,
                bool return_scores,
+               bool return_logits_vocab,
                bool return_no_speech_prob,
                size_t max_initial_timestamp_index,
                bool suppress_blank,
@@ -59,6 +60,7 @@ namespace ctranslate2 {
         options.max_length = max_length;
         options.num_hypotheses = num_hypotheses;
         options.return_scores = return_scores;
+        options.return_logits_vocab = return_logits_vocab;
         options.return_no_speech_prob = return_no_speech_prob;
         options.max_initial_timestamp_index = max_initial_timestamp_index;
         options.suppress_blank = suppress_blank;
@@ -247,6 +249,7 @@ namespace ctranslate2 {
              py::arg("no_repeat_ngram_size")=0,
              py::arg("max_length")=448,
              py::arg("return_scores")=false,
+             py::arg("return_logits_vocab")=false,
              py::arg("return_no_speech_prob")=false,
              py::arg("max_initial_timestamp_index")=50,
              py::arg("suppress_blank")=true,
@@ -276,6 +279,7 @@ namespace ctranslate2 {
                      (set 0 to disable).
                    max_length: Maximum generation length.
                    return_scores: Include the scores in the output.
+                   return_logits_vocab: Include the log probs in the output
                    return_no_speech_prob: Include the probability of the no speech token in the
                      result.
                    max_initial_timestamp_index: Maximum index of the first predicted timestamp.