Skip to content

Support v-prediction objective, fix some SDXL models outputting clouds #26

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,78 +5,96 @@ using namespace std;

namespace Axodox::MachineLearning
{
DpmPlusPlus2MScheduler::DpmPlusPlus2MScheduler(const StableDiffusionSchedulerOptions& options) :
StableDiffusionScheduler(options)
{
//Apply Karras sigmas
const auto rho = 7.f;

auto sigmaMax = _sigmas.front();
auto sigmaMin = *(_sigmas.end() - 2);

auto invRhoMin = pow(sigmaMin, 1.f / rho);
auto invRhoMax = pow(sigmaMax, 1.f / rho);

auto stepCount = _sigmas.size() - 1;
auto stepSize = 1.f / (stepCount - 1);
vector<float> timesteps(_timesteps.size());
vector<float> sigmas(_sigmas.size());
for (auto i = 0; i < stepCount; i++)
DpmPlusPlus2MScheduler::DpmPlusPlus2MScheduler(const StableDiffusionSchedulerOptions& options) :
StableDiffusionScheduler(options)
{
auto t = i * stepSize;
sigmas[i] = pow(invRhoMax + t * (invRhoMin - invRhoMax), rho);
timesteps[i] = SigmaToTime(sigmas[i]);
}

_sigmas = move(sigmas);
_timesteps = move(timesteps);
//Apply Karras sigmas
const auto rho = 7.f;

//_sigmas = { 14.614643096923828f, 10.81978988647461f, 7.9029436111450195f, 5.687837600708008f, 4.027743816375732f, 2.8015711307525635f, 1.9103621244430542f, 1.2741049528121948f, 0.8288719058036804f, 0.524259626865387f, 0.3211216926574707f, 0.18956425786018372f, 0.10719937086105347f, 0.05763240531086922f, 0.02916753850877285f, 0.f };
//_timesteps = { 999.f, 947.6224f, 889.5464f, 823.0464f, 745.8676f, 655.3113f, 549.0170f, 427.4898f, 298.6582f, 179.8307f, 89.9427f, 36.5918f, 12.0011f, 2.8839f, 0.f };
}
auto sigmaMax = _sigmas.front();
auto sigmaMin = *(_sigmas.end() - 2);

Tensor DpmPlusPlus2MScheduler::ApplyStep(const Tensor& input, const Tensor& output, size_t step)
{
auto currentSigma = _sigmas[step];
auto nextSigma = _sigmas[step + 1];
auto invRhoMin = pow(sigmaMin, 1.f / rho);
auto invRhoMax = pow(sigmaMax, 1.f / rho);

auto predictedOriginalSample = input.BinaryOperation<float>(output, [currentSigma](float a, float b) { return a - currentSigma * b; });
auto stepCount = _sigmas.size() - 1;
auto stepSize = 1.f / (stepCount - 1);
vector<float> timesteps(_timesteps.size());
vector<float> sigmas(_sigmas.size());
for (auto i = 0; i < stepCount; i++)
{
auto t = i * stepSize;
sigmas[i] = pow(invRhoMax + t * (invRhoMin - invRhoMax), rho);
timesteps[i] = SigmaToTime(sigmas[i]);
}

float t = -log(currentSigma);
float tNext = -log(nextSigma);
float h = tNext - t;
_sigmas = move(sigmas);
_timesteps = move(timesteps);

Tensor denoised;
if (!_previousPredictedSample || nextSigma == 0)
{
denoised = predictedOriginalSample;
}
else

Tensor DpmPlusPlus2MScheduler::ApplyStep(const Tensor& input, const Tensor& output, size_t step)
{
float hLast = t - -log(_sigmas[step - 1]);
float r = hLast / h;
auto currentSigma = _sigmas[step];
auto nextSigma = _sigmas[step + 1];

auto x = 1.f + 1.f / (2.f * r);
auto y = 1.f / (2.f * r);
Tensor predictedOriginalSample;

denoised = predictedOriginalSample.BinaryOperation<float>(_previousPredictedSample, [=](float a, float b) {
return x * a - y * b;
});
}
if (_predictiontype == StableDiffusionSchedulerPredictionType::V)
{

if (nextSigma != 0)
{
_previousPredictedSample = predictedOriginalSample;
}
else
{
_previousPredictedSample.Reset();
}
predictedOriginalSample = output.BinaryOperation<float>(input, [currentSigma](float model_output, float sample) {
float sigmaSquaredPlusOne = currentSigma * currentSigma + 1;
return (model_output * (-currentSigma / std::sqrt(sigmaSquaredPlusOne))) + (sample / sigmaSquaredPlusOne);
});

}
else if (_predictiontype == StableDiffusionSchedulerPredictionType::Epsilon)
{
predictedOriginalSample = input.BinaryOperation<float>(output, [currentSigma](float a, float b) { return a - currentSigma * b; });

float x = nextSigma / currentSigma;
float y = exp(-h) - 1.f;
return input.BinaryOperation<float>(denoised, [=](float a, float b) {
return a * x - b * y;
});
}
}
}
else
{
throw std::invalid_argument("Unimplemented prediction type.");

}

float t = -log(currentSigma);
float tNext = -log(nextSigma);
float h = tNext - t;

Tensor denoised;
if (!_previousPredictedSample || nextSigma == 0)
{
denoised = predictedOriginalSample;
}
else
{
float hLast = t - -log(_sigmas[step - 1]);
float r = hLast / h;

auto x = 1.f + 1.f / (2.f * r);
auto y = 1.f / (2.f * r);

denoised = predictedOriginalSample.BinaryOperation<float>(_previousPredictedSample, [=](float a, float b) {
return x * a - y * b;
});
}

if (nextSigma != 0)
{
_previousPredictedSample = predictedOriginalSample;
}
else
{
_previousPredictedSample.Reset();
}

float x = nextSigma / currentSigma;
float y = exp(-h) - 1.f;
return input.BinaryOperation<float>(denoised, [=](float a, float b) {
return a * x - b * y;
});
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,67 @@ using namespace std;

namespace Axodox::MachineLearning
{
//Initializes the Euler ancestral sampler; all schedule state (sigmas, timesteps,
//randoms) is prepared by the StableDiffusionScheduler base constructor.
EulerAncestralScheduler::EulerAncestralScheduler(const StableDiffusionSchedulerOptions& options) :
StableDiffusionScheduler(options)
{ }

//Performs one Euler ancestral denoising step: converts the model output to a
//predicted original sample, advances the latents by the Euler step, then injects
//fresh noise scaled by sigma_up (the "ancestral" part).
//NOTE(review): assumes epsilon-prediction (x0 = x - sigma * eps); models trained
//with the v-prediction objective are not handled here.
Tensor EulerAncestralScheduler::ApplyStep(const Tensor& input, const Tensor& output, size_t step)
{
auto currentSigma = _sigmas[step];
auto nextSigma = _sigmas[step + 1];
auto predictedOriginalSample = input.BinaryOperation<float>(output, [currentSigma](float a, float b) { return a - currentSigma * b; });

//Get ancestral step
//sigma_up = min(sigma_to, sqrt((sigma_from^2 - sigma_to^2) * sigma_to^2 / sigma_from^2))
auto currentSigmaSquared = currentSigma * currentSigma;
auto nextSigmaSquared = nextSigma * nextSigma;

float sigmaUp = min(nextSigma,
sqrt((currentSigmaSquared - nextSigmaSquared) * nextSigmaSquared / currentSigmaSquared));
float sigmaDown = sqrt(nextSigmaSquared - sigmaUp * sigmaUp);

//Euler method
//latentDelta folds the noise and the Euler increment into one pass: noise + dt * eps
float dt = sigmaDown - currentSigma;
auto randomNoise = Tensor::CreateRandom(input.Shape, _randoms, sigmaUp);
auto latentDelta = randomNoise.BinaryOperation<float>(output, [dt](float a, float b) { return a + dt * b; });
return input.BinaryOperation<float>(latentDelta, [](float a, float b) { return a + b; });
}
//Initializes the Euler ancestral sampler; schedule state (sigmas, timesteps,
//randoms, prediction type) is prepared by the StableDiffusionScheduler base constructor.
EulerAncestralScheduler::EulerAncestralScheduler(const StableDiffusionSchedulerOptions& options) :
StableDiffusionScheduler(options)
{ }

//Performs one Euler ancestral denoising step.
//First converts the model output to a predicted original sample according to the
//configured prediction objective, then advances the latents with an Euler step and
//injects fresh noise scaled by sigma_up (the "ancestral" part of the sampler).
//Throws std::invalid_argument for prediction types that are not implemented.
Tensor EulerAncestralScheduler::ApplyStep(const Tensor& input, const Tensor& output, size_t step)
{
  auto currentSigma = _sigmas[step];
  auto nextSigma = _sigmas[step + 1];

  Tensor predictedOriginalSample;

  if (_predictiontype == StableDiffusionSchedulerPredictionType::V)
  {
    //v-prediction: x0 = x / (sigma^2 + 1) - v * sigma / sqrt(sigma^2 + 1)
    //std::sqrt keeps the computation in float (avoids the C double overload)
    predictedOriginalSample = output.BinaryOperation<float>(input, [currentSigma](float model_output, float sample) {
      float sigmaSquaredPlusOne = currentSigma * currentSigma + 1;
      return (model_output * (-currentSigma / std::sqrt(sigmaSquaredPlusOne))) + (sample / sigmaSquaredPlusOne);
    });
  }
  else if (_predictiontype == StableDiffusionSchedulerPredictionType::Epsilon)
  {
    //epsilon-prediction: x0 = x - sigma * eps
    predictedOriginalSample = input.BinaryOperation<float>(output, [currentSigma](float a, float b) { return a - currentSigma * b; });
  }
  else
  {
    throw std::invalid_argument("Unimplemented prediction type.");
  }

  //Ancestral step, matching k-diffusion's get_ancestral_step:
  //sigma_up = min(sigma_to, sqrt((sigma_from^2 - sigma_to^2) * sigma_to^2 / sigma_from^2))
  //NOTE(review): the submitted sqrt(max(0, next^2 - current^2)) is always 0 because the
  //sigma schedule decreases, which silently disables the ancestral noise and turns this
  //into a plain Euler sampler - restored the formula from the pre-PR code.
  auto currentSigmaSquared = currentSigma * currentSigma;
  auto nextSigmaSquared = nextSigma * nextSigma;

  float sigmaUp = min(nextSigma,
    std::sqrt((currentSigmaSquared - nextSigmaSquared) * nextSigmaSquared / currentSigmaSquared));
  float sigmaDown = std::sqrt(nextSigmaSquared - sigmaUp * sigmaUp);

  //Euler method step size
  float dt = sigmaDown - currentSigma;

  //Derivative of the probability flow ODE: d = (x - x0) / sigma
  auto derivative = input.BinaryOperation<float>(predictedOriginalSample, [currentSigma](float inputVal, float predOriginalVal) {
    return (inputVal - predOriginalVal) / currentSigma;
  });

  //Advance the sample: x = x + d * dt
  auto updatedSample = input.BinaryOperation<float>(derivative, [dt](float inputVal, float derivativeVal) {
    return inputVal + derivativeVal * dt;
  });

  //Inject fresh noise scaled by sigma_up (zero on the final step, where nextSigma == 0)
  auto randomNoise = Tensor::CreateRandom(input.Shape, _randoms, sigmaUp);
  updatedSample = updatedSample.BinaryOperation<float>(randomNoise, [](float updatedSampleVal, float noiseVal) {
    return updatedSampleVal + noiseVal;
  });

  return updatedSample;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ namespace Axodox::MachineLearning
_trainingSigmas = move(trainingSigmas);
_sigmas = move(inferenceSigmas);
_timesteps = move(timesteps);
_predictiontype = options.PredictionType;
}

std::unique_ptr<StableDiffusionScheduler> StableDiffusionScheduler::Create(StableDiffusionSchedulerKind kind, const StableDiffusionSchedulerOptions& options)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,19 @@ namespace Axodox::MachineLearning
DpmPlusPlus2M
};

//Training objective of the diffusion model, which determines how the scheduler
//converts the UNet output into a predicted original sample.
enum class StableDiffusionSchedulerPredictionType {
//Model predicts the added noise: x0 = x - sigma * eps
Epsilon,
//Model predicts velocity (v-prediction objective)
V
};

struct AXODOX_MACHINELEARNING_API StableDiffusionSchedulerOptions
{
size_t TrainStepCount = 1000;
size_t InferenceStepCount = 20;
float BetaAtStart = 0.00085f;
float BetaAtEnd = 0.012f;
StableDiffusionSchedulerPredictionType PredictionType = StableDiffusionSchedulerPredictionType::Epsilon;

std::span<const float> BetasTrained;

std::span<std::minstd_rand> Randoms;
Expand All @@ -37,6 +44,7 @@ namespace Axodox::MachineLearning
std::vector<float> _timesteps;
std::vector<float> _trainingSigmas, _sigmas;
std::span<std::minstd_rand> _randoms;
StableDiffusionSchedulerPredictionType _predictiontype;

float SigmaToTime(float sigma) const;
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ namespace Axodox::MachineLearning
context.Randoms.push_back(minstd_rand{ options.Seed + uint32_t(i) });
}

context.Scheduler = StableDiffusionScheduler::Create(options.Scheduler, { .InferenceStepCount = options.StepCount, .Randoms = context.Randoms });
context.Scheduler = StableDiffusionScheduler::Create(options.Scheduler, { .InferenceStepCount = options.StepCount, .PredictionType = options.PredictionType, .Randoms = context.Randoms });

//Schedule steps
auto initialStep = size_t(clamp(int(options.StepCount - options.StepCount * options.DenoisingStrength - 1), 0, int(options.StepCount)));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ namespace Axodox::MachineLearning
Tensor MaskInput;
float DenoisingStrength = 1.f;
StableDiffusionSchedulerKind Scheduler = StableDiffusionSchedulerKind::EulerAncestral;
StableDiffusionSchedulerPredictionType PredictionType = StableDiffusionSchedulerPredictionType::Epsilon;

void Validate() const;
};
Expand Down
18 changes: 16 additions & 2 deletions Axodox.MachineLearning.Shared/MachineLearning/TextEncoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ namespace Axodox::MachineLearning
auto metadata = OnnxModelMetadata::Create(_environment, _session);
_has64bitInputIds = metadata.Inputs["input_ids"].Type == TensorType::Int64;
_hasHiddenLayers = metadata.Outputs.contains("hidden_states.11");
isSDXL = false;

_logger.log(log_severity::information, "Loaded.");
}
Expand All @@ -32,10 +33,18 @@ namespace Axodox::MachineLearning
{
_logger.log(log_severity::information, "Running inference...");

std::string hiddenStatesLayer = "hidden_states.11";

// https://github.com/huggingface/diffusers/blob/1f22c9882020cbe2cc08acfee54fab553bbb5678/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py#L387
// SDXL has -2 (use penultimate layer) CLIP skip in diffusers for both text encoders
// without this, some models generate noise/clouds.
if (isSDXL)
hiddenStatesLayer = "hidden_states.10";

//Bind values
IoBinding bindings{ _session };
bindings.BindInput("input_ids", text.ToInt64(_has64bitInputIds).ToOrtValue());
bindings.BindOutput(_hasHiddenLayers ? "hidden_states.11" : "last_hidden_state", _environment->MemoryInfo());
bindings.BindOutput(_hasHiddenLayers ? hiddenStatesLayer.c_str() : "last_hidden_state", _environment->MemoryInfo());

//Run inference
_session.Run({}, bindings);
Expand Down Expand Up @@ -76,7 +85,11 @@ namespace Axodox::MachineLearning
//Bind values
IoBinding bindings{ _session };
bindings.BindInput("input_ids", input.ToInt64(_has64bitInputIds).ToOrtValue());
bindings.BindOutput("hidden_states.11", _environment->MemoryInfo());

// https://github.com/huggingface/diffusers/blob/1f22c9882020cbe2cc08acfee54fab553bbb5678/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py#L387
// SDXL has -2 (use penultimate layer) CLIP skip in diffusers for both text encoders
// without this, some models generate noise/clouds.
bindings.BindOutput("hidden_states.31", _environment->MemoryInfo());
bindings.BindOutput("text_embeds", _environment->MemoryInfo());

//Run inference
Expand Down Expand Up @@ -113,6 +126,7 @@ namespace Axodox::MachineLearning
if (filesystem::exists(get<filesystem::path>(*source), ec))
{
_textEncoder2 = make_unique<TextEncoder2>(environment, source);
_textEncoder.isSDXL = true;
}
}

Expand Down
5 changes: 5 additions & 0 deletions Axodox.MachineLearning.Shared/MachineLearning/TextEncoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ namespace Axodox::MachineLearning

bool _has64bitInputIds;
bool _hasHiddenLayers;

protected:
friend class TextEncodingProvider;
bool isSDXL;

};

class AXODOX_MACHINELEARNING_API TextEncoder2
Expand Down