Skip to content

Commit 5e12f91

Browse files
authored
[ISTFT][Op][CPU][Ref] Audio results noise reduction (#29475)
PR to the master branch. ### Details: - Audio results noise reduction (update of the reference and CPU implementations of ISTFT) - Multiplication of the irdft results by the window before the overlap-add, and an update of the normalization - Verified on the model from the ticket ----------------- *Further work: update the tests to be able to detect smaller differences ### Tickets: - 160711 (for the related model) PR to the 2025.1 branch: - #29533
1 parent 2b477c1 commit 5e12f91

File tree

2 files changed

+22
-29
lines changed

2 files changed

+22
-29
lines changed

src/core/reference/src/op/istft.cpp

+11-13
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,10 @@ void istft(const float* in_data,
5151
const auto window_length = window_shape[0] < frame_size_dim ? window_shape[0] : frame_size_dim;
5252
std::vector<float> pad_window(frame_size, 0);
5353
std::copy(window, window + window_shape[0], pad_window.begin() + (frame_size_dim - window_length) / 2);
54+
std::vector<float> pow_window(frame_size, 0);
55+
std::transform(pad_window.begin(), pad_window.end(), pow_window.begin(), [](float win_val) {
56+
return win_val * win_val;
57+
});
5458

5559
std::vector<float> data_t(in_data, in_data + shape_size(data_shape));
5660
const auto stft_transp_out_shape = Shape{batch_size, num_frames, fft_out_shape[0], fft_out_shape[1]};
@@ -94,9 +98,7 @@ void istft(const float* in_data,
9498
for (size_t frame_idx = 0; frame_idx < num_frames; ++frame_idx) {
9599
const auto in_frame_start = batch_in_start + frame_idx * fft_out_shape_size;
96100
const auto in_frame_end = in_frame_start + fft_out_shape_size;
97-
98101
const auto out_frame_start = batch_out_start + frame_idx * frame_step;
99-
const auto out_frame_end = out_frame_start + frame_size;
100102

101103
std::vector<float> frame_data(data_t.data() + in_frame_start, data_t.data() + in_frame_end);
102104
reference::irdft(frame_data,
@@ -107,17 +109,13 @@ void istft(const float* in_data,
107109
frame_size_dim_shape,
108110
frame_size);
109111

110-
std::transform(frame_signal.begin(),
111-
frame_signal.end(),
112-
mid_result.begin() + out_frame_start,
113-
mid_result.begin() + out_frame_start,
114-
func::add<float>);
115-
116-
std::transform(window_sum.begin() + out_frame_start,
117-
window_sum.begin() + out_frame_end,
118-
pad_window.begin(),
119-
window_sum.begin() + out_frame_start,
120-
func::add<float>);
112+
// Overlap Add
113+
float* mid_result_sum = mid_result.data() + out_frame_start;
114+
float* window_frame_sum = window_sum.data() + out_frame_start;
115+
for (size_t i = 0; i < frame_signal.size(); ++i) {
116+
mid_result_sum[i] += frame_signal[i] * pad_window[i];
117+
window_frame_sum[i] += pow_window[i];
118+
}
121119
}
122120

123121
std::transform(result, result + signal_length, window_sum.begin() + batch_out_start, result, postprocess_func);

src/plugins/intel_cpu/src/nodes/istft.cpp

+11-16
Original file line numberDiff line numberDiff line change
@@ -136,13 +136,15 @@ void istft_impl(const float* in_data,
136136
OPENVINO_ASSERT(fft_results_dim == static_cast<size_t>((frame_size / 2) + 1));
137137

138138
const auto frame_size_dim = static_cast<size_t>(frame_size);
139-
const auto frame_size_dim_shape = ov::Shape{frame_size_dim};
140-
const auto frame_size_dim_shape_out = ov::Shape{frame_size_dim, 2};
141139
const auto fft_out_shape = ov::Shape{fft_results_dim, 2};
142140

143141
const auto window_length = window_shape[0] < frame_size_dim ? window_shape[0] : frame_size_dim;
144142
std::vector<float> pad_window(frame_size, 0);
145143
std::copy(window, window + window_shape[0], pad_window.begin() + (frame_size_dim - window_length) / 2);
144+
std::vector<float> pow_window(frame_size, 0);
145+
std::transform(pad_window.begin(), pad_window.end(), pow_window.begin(), [](float win_val) {
146+
return win_val * win_val;
147+
});
146148

147149
std::vector<float> data_t(shape_size(data_shape));
148150
const auto stft_transp_out_shape = ov::Shape{batch_size, num_frames, fft_out_shape[0], fft_out_shape[1]};
@@ -187,9 +189,7 @@ void istft_impl(const float* in_data,
187189
size_t batch_out_start = batch * signal_length;
188190

189191
const auto in_frame_start = batch_in_start + frame_idx * fft_out_shape_size;
190-
191192
const auto out_frame_start = batch_out_start + frame_idx * frame_step;
192-
const auto out_frame_end = out_frame_start + frame_size;
193193

194194
std::vector<float> frame_signal(frame_size);
195195
rdft_executor->execute(data_t.data() + in_frame_start,
@@ -203,25 +203,20 @@ void istft_impl(const float* in_data,
203203
{1},
204204
{1});
205205

206-
std::transform(frame_signal.begin(),
207-
frame_signal.end(),
208-
mid_result.begin() + out_frame_start,
209-
mid_result.begin() + out_frame_start,
210-
std::plus<>());
211-
212-
std::transform(window_sum.begin() + out_frame_start,
213-
window_sum.begin() + out_frame_end,
214-
pad_window.begin(),
215-
window_sum.begin() + out_frame_start,
216-
std::plus<>());
206+
// Overlap Add
207+
float* mid_result_sum = mid_result.data() + out_frame_start;
208+
float* window_frame_sum = window_sum.data() + out_frame_start;
209+
for (size_t i = 0; i < frame_signal.size(); ++i) {
210+
mid_result_sum[i] += frame_signal[i] * pad_window[i];
211+
window_frame_sum[i] += pow_window[i];
212+
}
217213
}
218214
float* result = mid_result.data() + (batch * signal_length);
219215
std::transform(result,
220216
result + signal_length,
221217
window_sum.begin() + batch * signal_length,
222218
result,
223219
postprocess_func);
224-
225220
const auto result_start = result + margin;
226221
std::copy(result_start, result_start + copy_end, final_result + batch * final_signal_length);
227222
});

0 commit comments

Comments
 (0)