From 2f660645eb1e086963eb58b69c10b75fd4480937 Mon Sep 17 00:00:00 2001 From: Siddartha Pothapragada Date: Wed, 6 May 2026 09:26:53 -0700 Subject: [PATCH] Fix C++ -Werror regressions in llama runner (#19326) Summary: Fixes 3 `-Werror` diagnostics that broke the qualcomm llama runner build on `cfg:android-arm64-clang19-no-san` and disabled the following test infra targets: - `xplat/executorch/examples/qualcomm/oss_scripts/llama:runner_lib` - `xplat/executorch/examples/qualcomm/oss_scripts/llama:runner_lib_static` - `xplat/executorch/examples/qualcomm/oss_scripts/llama:qnn_llama_runner` - `xplat/executorch/examples/qualcomm/oss_scripts/llama:qnn_llama_runner_static` Three diagnostics fixed: 1. `-Wreorder-ctor` in `runner.cpp`: `attention_sink_rope_module_` is declared as the 2nd field of `Runner` (right after `module_`) but the constructor initializer list appended it last, after `tokenizer_`. Moved it to the correct position in the init list to match declaration order. Recent regression introduced in the attention-sink diff (#16574). 2. `-Woverloaded-virtual` in `lhd_token_generator.h` and `multimodal_lhd_token_generator.h`: the derived classes define a `prepare_io(std::vector<uint64_t>, std::vector<int64_t>)` overload that hides the base class virtual `prepare_io(uint64_t, int64_t)`. Added a `using TokenGenerator::prepare_io;` (and equivalent for the multimodal hierarchy) declaration so the base virtual stays in scope and the warning is silenced without changing behavior. Latent bug surfaced by the clang19 toolchain bump. 3. `-Wdelete-non-abstract-non-virtual-dtor` in `prompt_processor.h`: `PromptProcessor` has virtual member functions but no virtual destructor, so deleting via `std::unique_ptr<PromptProcessor>` in `Runner` was undefined behavior under strict warnings. Added `virtual ~PromptProcessor() = default;` mirroring the pattern already used in `TokenGenerator` (`token_generator.h`). Also transitively fixes `MultimodalPromptProcessor`. 
Reviewed By: rascani Differential Revision: D103991803 --- .../qualcomm/oss_scripts/llama/runner/lhd_token_generator.h | 3 +++ .../runner/multimodal_runner/multimodal_lhd_token_generator.h | 3 +++ examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h | 2 ++ examples/qualcomm/oss_scripts/llama/runner/runner.cpp | 4 ++-- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h index e97f64b7c1d..796dde88014 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h +++ b/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h @@ -102,6 +102,9 @@ class LhdTokenGenerator : public TokenGenerator { AttentionSinkRopeRunner* attention_sink_rope_runner) override; private: + // Bring base class's virtual prepare_io into scope so the overload below + // does not hide it (-Woverloaded-virtual). + using TokenGenerator::prepare_io; /** * @brief Fill in I/O buffers with prompt token and position. * @param cur_token Current token. diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h index 83da9e7a6ba..7494afec6da 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h @@ -108,6 +108,9 @@ class MultimodalLhdTokenGenerator AttentionSinkRopeRunner* attention_sink_rope_runner) override; private: + // Bring base class's virtual prepare_io into scope so the overload below + // does not hide it (-Woverloaded-virtual). + using TokenGenerator::prepare_io; /** * @brief Fill in I/O buffers with prompt token and position. * @param cur_token Current token. 
diff --git a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h index 0790985d231..599f7050d83 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h +++ b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h @@ -40,6 +40,8 @@ class PromptProcessor { const std::string& method_name, Metadata metadata); + virtual ~PromptProcessor() = default; + /** * @brief Initialize I/O tensor and allocate I/O data buffer. * @param buffer_manager Pointer to IMemAlloc instance; by default, it uses a diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index 0e9b7860dbd..0a4a8b9abb5 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -102,6 +102,7 @@ Runner::Runner( std::unique_ptr tokenizer, std::unique_ptr attention_sink_rope_module) : module_(std::move(module)), + attention_sink_rope_module_(std::move(attention_sink_rope_module)), ngram_(ngram), window_(window), gcap_(gcap), @@ -111,8 +112,7 @@ Runner::Runner( temperature_(temperature), eval_mode_(static_cast(eval_mode)), shared_buffer_(shared_buffer), - tokenizer_(std::move(tokenizer)), - attention_sink_rope_module_(std::move(attention_sink_rope_module)) { + tokenizer_(std::move(tokenizer)) { stats_.reset(); if (decoder_model_version == "llama2") {