From b0d2f839bc4bc83b053eddb7c7255830b97d2564 Mon Sep 17 00:00:00 2001 From: lawnjelly Date: Mon, 25 Jul 2022 11:06:19 +0100 Subject: [PATCH] Optimize AudioServer::_driver_process() Move expensive calculations outside inner hot loops. --- servers/audio_server.cpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/servers/audio_server.cpp b/servers/audio_server.cpp index 411ad2e7109..6460c906481 100644 --- a/servers/audio_server.cpp +++ b/servers/audio_server.cpp @@ -261,7 +261,14 @@ void AudioServer::_driver_process(int p_frames, int32_t *p_buffer) { //master master, send to output int cs = master->channels.size(); + + // take away 1 from the stride as we are manually incrementing one for stereo + uintptr_t stride_minus_one = (cs * 2) - 1; + for (int k = 0; k < cs; k++) { + // destination start for data will be the same in all cases + int32_t *dest = &p_buffer[(from_buf * (cs * 2)) + (k * 2)]; + if (master->channels[k].active) { const AudioFrame *buf = master->channels[k].buffer.ptr(); @@ -269,18 +276,25 @@ void AudioServer::_driver_process(int p_frames, int32_t *p_buffer) { float l = CLAMP(buf[from + j].l, -1.0, 1.0); int32_t vl = l * ((1 << 20) - 1); int32_t vl2 = (vl < 0 ? -1 : 1) * (ABS(vl) << 11); - p_buffer[(from_buf + j) * (cs * 2) + k * 2 + 0] = vl2; + *dest = vl2; + dest++; float r = CLAMP(buf[from + j].r, -1.0, 1.0); int32_t vr = r * ((1 << 20) - 1); int32_t vr2 = (vr < 0 ? -1 : 1) * (ABS(vr) << 11); - p_buffer[(from_buf + j) * (cs * 2) + k * 2 + 1] = vr2; + *dest = vr2; + dest += stride_minus_one; } } else { + // Bizarrely, profiling indicates that detecting the common case of cs == 1 + // and k == 0, and using memset is SLOWER than setting individually. + // (Perhaps it gets optimized to a faster instruction than memset). for (int j = 0; j < to_copy; j++) { - p_buffer[(from_buf + j) * (cs * 2) + k * 2 + 0] = 0; - p_buffer[(from_buf + j) * (cs * 2) + k * 2 + 1] = 0; + *dest = 0; + dest++; + *dest = 0; + dest += stride_minus_one; } } }