Skip to content

Support v-prediction objective, fix some SDXL models outputting clouds #26

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,78 +5,96 @@ using namespace std;

namespace Axodox::MachineLearning
{
DpmPlusPlus2MScheduler::DpmPlusPlus2MScheduler(const StableDiffusionSchedulerOptions& options) :
StableDiffusionScheduler(options)
{
//Apply Karras sigmas
const auto rho = 7.f;

auto sigmaMax = _sigmas.front();
auto sigmaMin = *(_sigmas.end() - 2);

auto invRhoMin = pow(sigmaMin, 1.f / rho);
auto invRhoMax = pow(sigmaMax, 1.f / rho);

auto stepCount = _sigmas.size() - 1;
auto stepSize = 1.f / (stepCount - 1);
vector<float> timesteps(_timesteps.size());
vector<float> sigmas(_sigmas.size());
for (auto i = 0; i < stepCount; i++)
DpmPlusPlus2MScheduler::DpmPlusPlus2MScheduler(const StableDiffusionSchedulerOptions& options) :
StableDiffusionScheduler(options)
{
auto t = i * stepSize;
sigmas[i] = pow(invRhoMax + t * (invRhoMin - invRhoMax), rho);
timesteps[i] = SigmaToTime(sigmas[i]);
}

_sigmas = move(sigmas);
_timesteps = move(timesteps);
//Apply Karras sigmas
const auto rho = 7.f;

//_sigmas = { 14.614643096923828f, 10.81978988647461f, 7.9029436111450195f, 5.687837600708008f, 4.027743816375732f, 2.8015711307525635f, 1.9103621244430542f, 1.2741049528121948f, 0.8288719058036804f, 0.524259626865387f, 0.3211216926574707f, 0.18956425786018372f, 0.10719937086105347f, 0.05763240531086922f, 0.02916753850877285f, 0.f };
//_timesteps = { 999.f, 947.6224f, 889.5464f, 823.0464f, 745.8676f, 655.3113f, 549.0170f, 427.4898f, 298.6582f, 179.8307f, 89.9427f, 36.5918f, 12.0011f, 2.8839f, 0.f };
}
auto sigmaMax = _sigmas.front();
auto sigmaMin = *(_sigmas.end() - 2);

Tensor DpmPlusPlus2MScheduler::ApplyStep(const Tensor& input, const Tensor& output, size_t step)
{
auto currentSigma = _sigmas[step];
auto nextSigma = _sigmas[step + 1];
auto invRhoMin = pow(sigmaMin, 1.f / rho);
auto invRhoMax = pow(sigmaMax, 1.f / rho);

auto predictedOriginalSample = input.BinaryOperation<float>(output, [currentSigma](float a, float b) { return a - currentSigma * b; });
auto stepCount = _sigmas.size() - 1;
auto stepSize = 1.f / (stepCount - 1);
vector<float> timesteps(_timesteps.size());
vector<float> sigmas(_sigmas.size());
for (auto i = 0; i < stepCount; i++)
{
auto t = i * stepSize;
sigmas[i] = pow(invRhoMax + t * (invRhoMin - invRhoMax), rho);
timesteps[i] = SigmaToTime(sigmas[i]);
}

float t = -log(currentSigma);
float tNext = -log(nextSigma);
float h = tNext - t;
_sigmas = move(sigmas);
_timesteps = move(timesteps);

Tensor denoised;
if (!_previousPredictedSample || nextSigma == 0)
{
denoised = predictedOriginalSample;
}
else

Tensor DpmPlusPlus2MScheduler::ApplyStep(const Tensor& input, const Tensor& output, size_t step)
{
float hLast = t - -log(_sigmas[step - 1]);
float r = hLast / h;
auto currentSigma = _sigmas[step];
auto nextSigma = _sigmas[step + 1];

auto x = 1.f + 1.f / (2.f * r);
auto y = 1.f / (2.f * r);
Tensor predictedOriginalSample;

denoised = predictedOriginalSample.BinaryOperation<float>(_previousPredictedSample, [=](float a, float b) {
return x * a - y * b;
});
}
if (_predictiontype == StableDiffusionSchedulerPredictionType::V)
{

if (nextSigma != 0)
{
_previousPredictedSample = predictedOriginalSample;
}
else
{
_previousPredictedSample.Reset();
}
predictedOriginalSample = output.BinaryOperation<float>(input, [currentSigma](float model_output, float sample) {
float sigmaSquaredPlusOne = currentSigma * currentSigma + 1;
return (model_output * (-currentSigma / std::sqrt(sigmaSquaredPlusOne))) + (sample / sigmaSquaredPlusOne);
});

}
else if (_predictiontype == StableDiffusionSchedulerPredictionType::Epsilon)
{
predictedOriginalSample = input.BinaryOperation<float>(output, [currentSigma](float a, float b) { return a - currentSigma * b; });

float x = nextSigma / currentSigma;
float y = exp(-h) - 1.f;
return input.BinaryOperation<float>(denoised, [=](float a, float b) {
return a * x - b * y;
});
}
}
}
else
{
throw std::invalid_argument("Unimplemented prediction type.");

}

float t = -log(currentSigma);
float tNext = -log(nextSigma);
float h = tNext - t;

Tensor denoised;
if (!_previousPredictedSample || nextSigma == 0)
{
denoised = predictedOriginalSample;
}
else
{
float hLast = t - -log(_sigmas[step - 1]);
float r = hLast / h;

auto x = 1.f + 1.f / (2.f * r);
auto y = 1.f / (2.f * r);

denoised = predictedOriginalSample.BinaryOperation<float>(_previousPredictedSample, [=](float a, float b) {
return x * a - y * b;
});
}

if (nextSigma != 0)
{
_previousPredictedSample = predictedOriginalSample;
}
else
{
_previousPredictedSample.Reset();
}

float x = nextSigma / currentSigma;
float y = exp(-h) - 1.f;
return input.BinaryOperation<float>(denoised, [=](float a, float b) {
return a * x - b * y;
});
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,67 @@ using namespace std;

namespace Axodox::MachineLearning
{
//Initializes the Euler ancestral sampler; all schedule state (sigmas, timesteps,
//randoms) is prepared by the StableDiffusionScheduler base constructor.
EulerAncestralScheduler::EulerAncestralScheduler(const StableDiffusionSchedulerOptions& options) :
StableDiffusionScheduler(options)
{ }

//Performs one Euler ancestral denoising step: converts the model output to a
//predicted original sample, advances the latents by the Euler step, then injects
//fresh noise scaled by sigma_up (the "ancestral" part).
//NOTE(review): assumes epsilon-prediction (x0 = x - sigma * eps); models trained
//with the v-prediction objective are not handled here.
Tensor EulerAncestralScheduler::ApplyStep(const Tensor& input, const Tensor& output, size_t step)
{
auto currentSigma = _sigmas[step];
auto nextSigma = _sigmas[step + 1];
auto predictedOriginalSample = input.BinaryOperation<float>(output, [currentSigma](float a, float b) { return a - currentSigma * b; });

//Get ancestral step
//sigma_up = min(sigma_to, sqrt((sigma_from^2 - sigma_to^2) * sigma_to^2 / sigma_from^2))
auto currentSigmaSquared = currentSigma * currentSigma;
auto nextSigmaSquared = nextSigma * nextSigma;

float sigmaUp = min(nextSigma,
sqrt((currentSigmaSquared - nextSigmaSquared) * nextSigmaSquared / currentSigmaSquared));
float sigmaDown = sqrt(nextSigmaSquared - sigmaUp * sigmaUp);

//Euler method
//latentDelta folds the noise and the Euler increment into one pass: noise + dt * eps
float dt = sigmaDown - currentSigma;
auto randomNoise = Tensor::CreateRandom(input.Shape, _randoms, sigmaUp);
auto latentDelta = randomNoise.BinaryOperation<float>(output, [dt](float a, float b) { return a + dt * b; });
return input.BinaryOperation<float>(latentDelta, [](float a, float b) { return a + b; });
}
//Initializes the Euler ancestral sampler; schedule state (sigmas, timesteps,
//randoms, prediction type) is prepared by the StableDiffusionScheduler base constructor.
EulerAncestralScheduler::EulerAncestralScheduler(const StableDiffusionSchedulerOptions& options) :
StableDiffusionScheduler(options)
{ }

//Performs one Euler ancestral denoising step.
//First converts the model output to a predicted original sample according to the
//configured prediction objective, then advances the latents with an Euler step and
//injects fresh noise scaled by sigma_up (the "ancestral" part of the sampler).
//Throws std::invalid_argument for prediction types that are not implemented.
Tensor EulerAncestralScheduler::ApplyStep(const Tensor& input, const Tensor& output, size_t step)
{
  auto currentSigma = _sigmas[step];
  auto nextSigma = _sigmas[step + 1];

  Tensor predictedOriginalSample;

  if (_predictiontype == StableDiffusionSchedulerPredictionType::V)
  {
    //v-prediction: x0 = x / (sigma^2 + 1) - v * sigma / sqrt(sigma^2 + 1)
    //std::sqrt keeps the computation in float (avoids the C double overload)
    predictedOriginalSample = output.BinaryOperation<float>(input, [currentSigma](float model_output, float sample) {
      float sigmaSquaredPlusOne = currentSigma * currentSigma + 1;
      return (model_output * (-currentSigma / std::sqrt(sigmaSquaredPlusOne))) + (sample / sigmaSquaredPlusOne);
    });
  }
  else if (_predictiontype == StableDiffusionSchedulerPredictionType::Epsilon)
  {
    //epsilon-prediction: x0 = x - sigma * eps
    predictedOriginalSample = input.BinaryOperation<float>(output, [currentSigma](float a, float b) { return a - currentSigma * b; });
  }
  else
  {
    throw std::invalid_argument("Unimplemented prediction type.");
  }

  //Ancestral step, matching k-diffusion's get_ancestral_step:
  //sigma_up = min(sigma_to, sqrt((sigma_from^2 - sigma_to^2) * sigma_to^2 / sigma_from^2))
  //NOTE(review): the submitted sqrt(max(0, next^2 - current^2)) is always 0 because the
  //sigma schedule decreases, which silently disables the ancestral noise and turns this
  //into a plain Euler sampler - restored the formula from the pre-PR code.
  auto currentSigmaSquared = currentSigma * currentSigma;
  auto nextSigmaSquared = nextSigma * nextSigma;

  float sigmaUp = min(nextSigma,
    std::sqrt((currentSigmaSquared - nextSigmaSquared) * nextSigmaSquared / currentSigmaSquared));
  float sigmaDown = std::sqrt(nextSigmaSquared - sigmaUp * sigmaUp);

  //Euler method step size
  float dt = sigmaDown - currentSigma;

  //Derivative of the probability flow ODE: d = (x - x0) / sigma
  auto derivative = input.BinaryOperation<float>(predictedOriginalSample, [currentSigma](float inputVal, float predOriginalVal) {
    return (inputVal - predOriginalVal) / currentSigma;
  });

  //Advance the sample: x = x + d * dt
  auto updatedSample = input.BinaryOperation<float>(derivative, [dt](float inputVal, float derivativeVal) {
    return inputVal + derivativeVal * dt;
  });

  //Inject fresh noise scaled by sigma_up (zero on the final step, where nextSigma == 0)
  auto randomNoise = Tensor::CreateRandom(input.Shape, _randoms, sigmaUp);
  updatedSample = updatedSample.BinaryOperation<float>(randomNoise, [](float updatedSampleVal, float noiseVal) {
    return updatedSampleVal + noiseVal;
  });

  return updatedSample;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ namespace Axodox::MachineLearning
_trainingSigmas = move(trainingSigmas);
_sigmas = move(inferenceSigmas);
_timesteps = move(timesteps);
_predictiontype = options.PredictionType;
}

std::unique_ptr<StableDiffusionScheduler> StableDiffusionScheduler::Create(StableDiffusionSchedulerKind kind, const StableDiffusionSchedulerOptions& options)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,19 @@ namespace Axodox::MachineLearning
DpmPlusPlus2M
};

//Training objective of the diffusion model, which determines how the scheduler
//converts the UNet output into a predicted original sample.
enum class StableDiffusionSchedulerPredictionType {
//Model predicts the added noise: x0 = x - sigma * eps
Epsilon,
//Model predicts velocity (v-prediction objective)
V
};

struct AXODOX_MACHINELEARNING_API StableDiffusionSchedulerOptions
{
size_t TrainStepCount = 1000;
size_t InferenceStepCount = 20;
float BetaAtStart = 0.00085f;
float BetaAtEnd = 0.012f;
StableDiffusionSchedulerPredictionType PredictionType = StableDiffusionSchedulerPredictionType::Epsilon;

std::span<const float> BetasTrained;

std::span<std::minstd_rand> Randoms;
Expand All @@ -37,6 +44,7 @@ namespace Axodox::MachineLearning
std::vector<float> _timesteps;
std::vector<float> _trainingSigmas, _sigmas;
std::span<std::minstd_rand> _randoms;
StableDiffusionSchedulerPredictionType _predictiontype;

float SigmaToTime(float sigma) const;
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ namespace Axodox::MachineLearning
context.Randoms.push_back(minstd_rand{ options.Seed + uint32_t(i) });
}

context.Scheduler = StableDiffusionScheduler::Create(options.Scheduler, { .InferenceStepCount = options.StepCount, .Randoms = context.Randoms });
context.Scheduler = StableDiffusionScheduler::Create(options.Scheduler, { .InferenceStepCount = options.StepCount, .PredictionType = options.PredictionType, .Randoms = context.Randoms });

//Schedule steps
auto initialStep = size_t(clamp(int(options.StepCount - options.StepCount * options.DenoisingStrength - 1), 0, int(options.StepCount)));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ namespace Axodox::MachineLearning
Tensor MaskInput;
float DenoisingStrength = 1.f;
StableDiffusionSchedulerKind Scheduler = StableDiffusionSchedulerKind::EulerAncestral;
StableDiffusionSchedulerPredictionType PredictionType = StableDiffusionSchedulerPredictionType::Epsilon;

void Validate() const;
};
Expand Down
18 changes: 16 additions & 2 deletions Axodox.MachineLearning.Shared/MachineLearning/TextEncoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ namespace Axodox::MachineLearning
auto metadata = OnnxModelMetadata::Create(_environment, _session);
_has64bitInputIds = metadata.Inputs["input_ids"].Type == TensorType::Int64;
_hasHiddenLayers = metadata.Outputs.contains("hidden_states.11");
isSDXL = false;

_logger.log(log_severity::information, "Loaded.");
}
Expand All @@ -32,10 +33,18 @@ namespace Axodox::MachineLearning
{
_logger.log(log_severity::information, "Running inference...");

std::string hiddenStatesLayer = "hidden_states.11";

// https://github.com/huggingface/diffusers/blob/1f22c9882020cbe2cc08acfee54fab553bbb5678/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py#L387
// SDXL has -2 (use penultimate layer) CLIP skip in diffusers for both text encoders
// without this, some models generate noise/clouds.
if (isSDXL)
hiddenStatesLayer = "hidden_states.10";

//Bind values
IoBinding bindings{ _session };
bindings.BindInput("input_ids", text.ToInt64(_has64bitInputIds).ToOrtValue());
bindings.BindOutput(_hasHiddenLayers ? "hidden_states.11" : "last_hidden_state", _environment->MemoryInfo());
bindings.BindOutput(_hasHiddenLayers ? hiddenStatesLayer.c_str() : "last_hidden_state", _environment->MemoryInfo());

//Run inference
_session.Run({}, bindings);
Expand Down Expand Up @@ -76,7 +85,11 @@ namespace Axodox::MachineLearning
//Bind values
IoBinding bindings{ _session };
bindings.BindInput("input_ids", input.ToInt64(_has64bitInputIds).ToOrtValue());
bindings.BindOutput("hidden_states.11", _environment->MemoryInfo());

// https://github.com/huggingface/diffusers/blob/1f22c9882020cbe2cc08acfee54fab553bbb5678/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py#L387
// SDXL has -2 (use penultimate layer) CLIP skip in diffusers for both text encoders
// without this, some models generate noise/clouds.
bindings.BindOutput("hidden_states.31", _environment->MemoryInfo());
bindings.BindOutput("text_embeds", _environment->MemoryInfo());

//Run inference
Expand Down Expand Up @@ -113,6 +126,7 @@ namespace Axodox::MachineLearning
if (filesystem::exists(get<filesystem::path>(*source), ec))
{
_textEncoder2 = make_unique<TextEncoder2>(environment, source);
_textEncoder.isSDXL = true;
}
}

Expand Down
5 changes: 5 additions & 0 deletions Axodox.MachineLearning.Shared/MachineLearning/TextEncoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ namespace Axodox::MachineLearning

bool _has64bitInputIds;
bool _hasHiddenLayers;

protected:
friend class TextEncodingProvider;
bool isSDXL;

};

class AXODOX_MACHINELEARNING_API TextEncoder2
Expand Down