This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Genetic Tuning Improvements #453

Open · wants to merge 6 commits into base: master
@@ -58,7 +58,6 @@ You can read about all the parameters here - :ref:`autotuner_parameters`.
- :code:`threads` - set this to the number of CPU cores available.
- :code:`generations` - 5 to 10 generations is a good number.
- :code:`pop_size` - 10 is usually reasonable. You can try 10 to 20.
- :code:`number_elites` - number of candidates preserved intact between generations. `1` is usually sufficient.
- :code:`min_launch_total_threads` - if you have really small input sizes, set this to `1`.
- :code:`gpus` - number of GPUs to use for autotuning. The default value is "0". Set this to "0,1" if you wish to use two GPUs, for example.

@@ -70,15 +69,15 @@ kernel timing. You can adopt the following parameter settings as starters for autotuning
.. code::

settings = {
"threads": 32, "generations": 2, "pop_size": 10, "number_elites": 1
"threads": 32, "generations": 2, "pop_size": 10
}

* The good defaults that run for a bit longer (in exchange for better performance):

.. code::

settings = {
"threads": 32, "generations": 5, "pop_size": 10, "number_elites": 1
"threads": 32, "generations": 5, "pop_size": 10
}


@@ -87,7 +86,7 @@ kernel timing. You can adopt the following parameter settings as starters for autotuning
.. code::

settings = {
"threads": 32, "generations": 25, "pop_size": 100, "number_elites": 10
"threads": 32, "generations": 25, "pop_size": 100
}
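
For reference, the keys in the ``settings`` dict correspond, at the C++ level, to gflags read by the tuning harness. The flag names below appear in ``tc/autotuner/autotuner-inl.h`` later in this diff, but the exact key-to-flag mapping noted in the comments is an assumption rather than something this PR documents. A minimal sketch of setting those flags programmatically:

.. code:: cpp

    #include <gflags/gflags.h>

    #include <iostream>
    #include <string>
    #include <utility>

    // Sketch only: the tuner flags are defined inside the TC autotuner library,
    // so this assumes the program links against it. SetCommandLineOption returns
    // an empty string for an unknown flag, so the sketch degrades gracefully
    // when built standalone.
    int main(int argc, char* argv[]) {
      gflags::ParseCommandLineFlags(&argc, &argv, true);
      const std::pair<const char*, const char*> tunerSettings[] = {
          {"tuner_threads", "32"},        // "threads" in the settings dict (assumed mapping)
          {"tuner_gen_generations", "5"}, // "generations"
          {"tuner_gen_pop_size", "10"},   // "pop_size"
          {"tuner_devices", "0"},         // "gpus"
      };
      for (const auto& kv : tunerSettings) {
        const std::string result = gflags::SetCommandLineOption(kv.first, kv.second);
        std::cout << (result.empty() ? "unknown flag: " : "set: ") << kv.first << "\n";
      }
      return 0;
    }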


2 changes: 1 addition & 1 deletion docs/source/tutorials/tutorial_tensordot_with_tc.rst
@@ -132,7 +132,7 @@ later.
You can control the amount of autotuning by changing the autotuner parameters. See
:ref:`autotune_parameters` for how to change the settings.

For the setting ``settings={"generations": 25, "pop_size": 100, "number_elites": 10}``, we
For the setting ``settings={"generations": 25, "pop_size": 100}``, we
get decent kernel performance, as shown in the screenshot below (tuned on one M40 GPU):

.. figure:: ../_static/img/autotuning-py.jpg
127 changes: 74 additions & 53 deletions tc/autotuner/autotuner-inl.h
@@ -79,16 +79,16 @@ TuningHarness<Backend>::bestMappingOptions() const {
}

template <typename Backend>
template <typename SearchStrategy>
void TuningHarness<Backend>::doCompile(SearchStrategy& searchStrategy) {
template <typename Candidates>
void TuningHarness<Backend>::doCompile(Candidates& candidates) {
// Atomically fetch and add the next job until there are no jobs left
while (true) {
auto current = currentCompilationJob_.fetch_add(1);
if (current >= searchStrategy.population.size()) {
if (current >= candidates.size()) {
break;
}
std::unique_ptr<typename Backend::ExecutorType> pExecutor(nullptr);
auto pConf = searchStrategy.population.at(current).get();
auto pConf = candidates.at(current).get();
auto options = makeOptions<Backend>(baseMapping_, *pConf);
try {
if (FLAGS_debug_tuner) {
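
A note for readers following this hunk: ``doCompile`` can now take any container of candidates because the work distribution is nothing more than an atomic fetch-and-add over indices. A self-contained sketch of that pattern (the candidate type and the "compilation" are stubbed out; this is not TC code):

.. code:: cpp

    #include <atomic>
    #include <cstddef>
    #include <iostream>
    #include <thread>
    #include <vector>

    // Each worker atomically claims the next index until the candidate list is
    // exhausted, mirroring the fetch_add loop in doCompile above.
    int main() {
      std::vector<int> candidates(100, 0); // stand-ins for CandidateConfiguration
      std::atomic<std::size_t> nextJob{0};

      auto worker = [&]() {
        while (true) {
          const std::size_t current = nextJob.fetch_add(1);
          if (current >= candidates.size()) {
            break; // no jobs left
          }
          candidates[current] = 1; // "compile" candidate `current`
        }
      };

      std::vector<std::thread> threads;
      for (int i = 0; i < 8; ++i) {
        threads.emplace_back(worker);
      }
      for (auto& t : threads) {
        t.join();
      }
      std::cout << "compiled " << candidates.size() << " candidates\n";
      return 0;
    }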
@@ -243,56 +243,76 @@ void TuningHarness<Backend>::runOneIteration(
size_t iteration) {
// Define tensors per device once globally
auto devices = detail::parseDevices<Backend>(FLAGS_tuner_devices);
CHECK(executors_.empty());
CHECK(configurations_.empty());

{
// Initialize for this round
currentCompilationJob_.store(0);
numEvaluations_.store(0);
Printer printer(
iteration,
searchStrategy.population.size(),
currentCompilationJob_,
numEvaluations_);
auto logIterations = FLAGS_tuner_gen_log_generations;
ScopeGuard sgPrinter([logIterations, &printer]() {
printer.stop();
if (logIterations) {
printer.printAll();
}
});

// Just spawn and join new threads for each iteration
std::vector<std::thread> cpuCompilationThreads;
cpuCompilationThreads.reserve(FLAGS_tuner_threads);
ScopeGuard sgCompilationThreads([&cpuCompilationThreads]() {
for (auto& cpuCompilationThread : cpuCompilationThreads) {
cpuCompilationThread.join();
}
});
for (size_t i = 0; i < FLAGS_tuner_threads; ++i) {
cpuCompilationThreads.emplace_back(
[this, &searchStrategy]() { this->doCompile(searchStrategy); });
}
for (uint64_t step = 0; step < searchStrategy.stepsPerIteration; ++step) {
{
CHECK(executors_.empty());
CHECK(configurations_.empty());
auto& candidates = searchStrategy.candidatesOfStep(step);
auto firstNew = std::partition(
candidates.begin(),
candidates.end(),
[](const std::unique_ptr<CandidateConfiguration>& c) {
return c->runtime != Duration::zero();
});
GeneticSearch::Population newCandidates(
std::distance(firstNew, candidates.end()));
std::move(firstNew, candidates.end(), newCandidates.begin());
ScopeGuard candidatesSG([&]() {
std::move(newCandidates.begin(), newCandidates.end(), firstNew);
});

// Just spawn and join new threads for each device
std::vector<std::thread> workerThreads;
workerThreads.reserve(devices.size());
LOG_IF(INFO, tc::FLAGS_debug_tuner)
<< "Start evaluation: " << devices.size() << " " << executors_.size()
<< " " << configurations_.size();
ScopeGuard sgDeviceWorkerThreads([&workerThreads]() {
for (auto& workerThread : workerThreads) {
workerThread.join();
if (not newCandidates.empty()) {
auto populationSize = newCandidates.size();
// Initialize for this round
currentCompilationJob_.store(0);
numEvaluations_.store(0);
Printer printer(
iteration,
step,
populationSize,
currentCompilationJob_,
numEvaluations_);
auto logIterations = FLAGS_tuner_gen_log_generations;
ScopeGuard sgPrinter([logIterations, &printer]() {
printer.stop();
if (logIterations) {
printer.printAll();
}
});

// Just spawn and join new threads for each iteration
std::vector<std::thread> cpuCompilationThreads;
cpuCompilationThreads.reserve(FLAGS_tuner_threads);
ScopeGuard sgCompilationThreads([&cpuCompilationThreads]() {
for (auto& cpuCompilationThread : cpuCompilationThreads) {
cpuCompilationThread.join();
}
});
for (size_t i = 0; i < FLAGS_tuner_threads; ++i) {
cpuCompilationThreads.emplace_back(
[this, &newCandidates]() { this->doCompile(newCandidates); });
}

// Just spawn and join new threads for each device
std::vector<std::thread> workerThreads;
workerThreads.reserve(devices.size());
LOG_IF(INFO, tc::FLAGS_debug_tuner)
<< "Start evaluation: " << devices.size() << " "
<< executors_.size() << " " << configurations_.size();
ScopeGuard sgDeviceWorkerThreads([&workerThreads]() {
for (auto& workerThread : workerThreads) {
workerThread.join();
}
});
for (auto device : devices) {
workerThreads.emplace_back(
[this, device, populationSize, &printer]() {
this->doEvaluate(device, populationSize, printer);
});
}
}
});
auto populationSize = searchStrategy.population.size();
for (auto device : devices) {
workerThreads.emplace_back([this, device, populationSize, &printer]() {
this->doEvaluate(device, populationSize, printer);
});
}
searchStrategy.finishStep(step);
}

// At this point everything is synchronized because out of scope, done
@@ -303,7 +323,6 @@ void TuningHarness<Backend>::runOneIteration(
infoPrinter << bestMappingOptions();
LOG_LINE_BY_LINE(INFO, ssInfo);
}
searchStrategy.updateParameters();
}
} // namespace detail
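
The core of the new per-step loop above is: partition the candidates that already have a runtime away from the new ones, move the new ones out for compilation and evaluation, and move them back through a scope guard when the step ends. A standalone sketch of that idiom, using a minimal ``ScopeGuard`` stand-in rather than TC's:

.. code:: cpp

    #include <algorithm>
    #include <chrono>
    #include <functional>
    #include <iostream>
    #include <iterator>
    #include <memory>
    #include <utility>
    #include <vector>

    using Duration = std::chrono::microseconds;

    // Minimal stand-ins for the types used in runOneIteration.
    struct CandidateConfiguration {
      Duration runtime{Duration::zero()}; // zero means "not evaluated yet"
    };
    using Population = std::vector<std::unique_ptr<CandidateConfiguration>>;

    // Minimal stand-in for tc::ScopeGuard: runs a callback at scope exit.
    struct ScopeGuard {
      explicit ScopeGuard(std::function<void()> f) : f_(std::move(f)) {}
      ~ScopeGuard() { f_(); }
      std::function<void()> f_;
    };

    int main() {
      Population candidates;
      for (int i = 0; i < 10; ++i) {
        candidates.push_back(std::make_unique<CandidateConfiguration>());
      }
      // Pretend the first three were evaluated in an earlier step.
      for (int i = 0; i < 3; ++i) {
        candidates[i]->runtime = Duration(100 + i);
      }

      {
        // Already-evaluated candidates go to the front, new ones to the back.
        auto firstNew = std::partition(
            candidates.begin(), candidates.end(),
            [](const std::unique_ptr<CandidateConfiguration>& c) {
              return c->runtime != Duration::zero();
            });

        // Move the new candidates out for compilation/evaluation...
        Population newCandidates(std::distance(firstNew, candidates.end()));
        std::move(firstNew, candidates.end(), newCandidates.begin());
        // ...and move them back when this scope ends, evaluated or not.
        ScopeGuard restore([&]() {
          std::move(newCandidates.begin(), newCandidates.end(), firstNew);
        });

        std::cout << "new candidates this step: " << newCandidates.size() << "\n";
        for (auto& c : newCandidates) {
          c->runtime = Duration(42); // stand-in for compile + profile
        }
      }

      const bool allEvaluated = std::all_of(
          candidates.begin(), candidates.end(),
          [](const std::unique_ptr<CandidateConfiguration>& c) {
            return c != nullptr && c->runtime != Duration::zero();
          });
      std::cout << "all candidates evaluated: " << allEvaluated << "\n";
      return 0;
    }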

@@ -460,13 +479,15 @@ Autotuner<Backend, SearchStrategy>::tune(
});

// searchStrategy is passed to tuningHarness.run()
// XXX: this is not generic
SearchStrategy searchStrategy(
configs,
FLAGS_tuner_gen_generations,
FLAGS_tuner_gen_pop_size,
FLAGS_tuner_gen_crossover_rate,
FLAGS_tuner_gen_mutation_rate,
FLAGS_tuner_gen_number_elites);
FLAGS_tuner_gen_mating_pool_size,
FLAGS_tuner_gen_selection_pool_size);
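
With ``updateParameters()`` gone, the harness now only calls ``stepsPerIteration``, ``candidatesOfStep(step)`` and ``finishStep(step)`` on the strategy, and the genetic strategy is configured with mating-pool and selection-pool sizes instead of ``number_elites``. A hedged outline of the interface implied by those calls (member names mirror this diff; the class name and bodies are placeholders, not TC's ``GeneticSearch``):

.. code:: cpp

    #include <chrono>
    #include <cstdint>
    #include <memory>
    #include <vector>

    using Duration = std::chrono::microseconds;

    struct CandidateConfiguration {
      Duration runtime{Duration::zero()}; // zero means "not evaluated yet"
    };
    using Population = std::vector<std::unique_ptr<CandidateConfiguration>>;

    // Hypothetical strategy skeleton exposing the surface that
    // TuningHarness::runOneIteration relies on after this change.
    class StepwiseSearchStrategy {
     public:
      // How many compile/evaluate rounds the harness runs per tuning iteration:
      // for (step = 0; step < stepsPerIteration; ++step) { ... }
      const uint64_t stepsPerIteration = 1;

      // Candidates to compile and evaluate at this step; candidates whose
      // runtime is already non-zero are skipped by the harness.
      Population& candidatesOfStep(uint64_t step) {
        (void)step;
        return population_;
      }

      // Called once the step's evaluations are done, e.g. to select parents
      // and breed the pool for the next step.
      void finishStep(uint64_t step) {
        (void)step;
      }

     private:
      Population population_;
    };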

// Create a tuning harness
detail::TuningHarness<Backend> tuningHarness(