From 3721da90dccde88f472a568e6b931558e146e679 Mon Sep 17 00:00:00 2001
From: Dietmar Maurer
Date: Fri, 4 Apr 2025 11:28:37 +0200
Subject: [PATCH] audio: implement re-sync, move code to separate file

Signed-off-by: Dietmar Maurer
---
 core/audio.js | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++
 core/rfb.js   |  67 +++++---------
 2 files changed, 187 insertions(+), 50 deletions(-)
 create mode 100644 core/audio.js

diff --git a/core/audio.js b/core/audio.js
new file mode 100644
index 00000000..78992caa
--- /dev/null
+++ b/core/audio.js
@@ -0,0 +1,170 @@
+// The RFB protocol (VNC) is designed for real-time user interactions
+// and allows transferring audio messages together with screen content.
+// It is not possible to use any kind of buffering, because that would
+// introduce large delays between user interaction and content display.
+//
+// This is not really a problem with screen content, because the human
+// brain is quite tolerant of slight speed changes in video content,
+// and we mostly transfer non-video data anyway.
+//
+// With audio, the situation is quite different, as it must be played
+// at a constant speed. Any delay leads to audio distortion, which is
+// unpleasant for humans.
+//
+// Without buffering, it is always possible for audio frames to arrive
+// too late or too early due to changing network speeds.
+//
+// We use the following algorithm:
+//
+// - a small jitter buffer to tolerate small speed changes (20 ms)
+// - simply discard late audio frames
+// - queue early frames with a slight speedup (pitch scaling) to re-sync audio
+// - if we get too many early frames, skip frames for a fast re-sync
+//
+// ## Audio format
+//
+// We use/expect U16 raw audio data.
+
+import * as Log from './util/logging.js';
+
+export default class Audio {
+    constructor(sample_rate, nchannels) {
+        this._next_start = 0;
+        this._context = null;
+        this._jitter = 0.02;
+        this._resample_trigger = 5*this._jitter;
+        this._stable_time = 1.0;
+
+        // ===== PROPERTIES =====
+        this._sample_rate = sample_rate;
+        this._nchannels = nchannels;
+    }
+
+    // ===== PROPERTIES =====
+    get sample_rate() { return this._sample_rate; }
+    get nchannels() { return this._nchannels; }
+
+    // ===== PUBLIC METHODS =====
+
+    // Stop audio playback
+    //
+    // Further audio frames are simply dropped.
+    stop() {
+        this._context = null;
+        this._next_start = 0;
+    }
+
+    start() {
+        this._context = new AudioContext({
+            latencyHint: "interactive",
+            sampleRate: this._sample_rate,
+        });
+        this._next_start = 0;
+    }
+
+    play(payload) {
+        if (this._context === null) {
+            return true;
+        }
+
+        let ctime = this._context.currentTime;
+
+        let time_offset = this._next_start - ctime;
+
+        let sample_bytes = 2*this._nchannels;
+
+        if ((time_offset < this._jitter) && (this._resample_trigger !== 5*this._jitter)) {
+            Log.Debug("Stop resampling because audio is in sync (delay = " + time_offset + " sec)");
+            this._resample_trigger = 5*this._jitter;
+        }
+
+        let buffer = null;
+        if (time_offset > this._resample_trigger && (payload.length > (100*sample_bytes))) {
+            if (this._resample_trigger !== this._jitter) {
+                Log.Debug("Start resampling to re-sync audio (delay = " + time_offset + " sec)");
+                this._resample_trigger = this._jitter;
+            }
+            buffer = this._pitchScale(payload, 1.01); // increase pitch by 1%
+        } else {
+            buffer = this._createBuffer(payload);
+        }
+
+        if (this._next_start > 0) {
+            if (time_offset < -buffer.duration) {
+                Log.Warn("Skip delayed audio frame (delay = " + (-time_offset) + " sec)");
+                this._next_start = ctime + this._jitter;
+                return true; // do not play delayed frame - skip it!
+            }
+            if (time_offset > 0.5) {
+                Log.Warn("Move fast audio frame (offset = " + time_offset + " sec)");
+                this._stable_time = 0;
+                return true; // skip frame.
+            }
+        }
+
+        this._stable_time += buffer.duration;
+
+        if (this._next_start === 0) {
+            this._next_start = ctime + this._jitter;
+        }
+
+        let start_time = this._next_start;
+        this._next_start += buffer.duration;
+
+        if (this._stable_time >= 1.0) {
+            let source = this._context.createBufferSource();
+            source.buffer = buffer;
+            source.connect(this._context.destination);
+            source.start(start_time);
+        }
+
+        return true;
+    }
+
+    // ===== PRIVATE METHODS =====
+
+    // see: https://en.wikipedia.org/wiki/Audio_time_stretching_and_pitch_scaling
+    _pitchScale(payload, factor) {
+        let sample_bytes = 2*this._nchannels;
+        let new_length = Math.ceil(payload.length/(factor*sample_bytes));
+
+        let buffer = this._context.createBuffer(this._nchannels, new_length, this._sample_rate);
+        for (let ch = 0; ch < this._nchannels; ch++) {
+            const channel = buffer.getChannelData(ch);
+            let channel_offset = ch*2;
+            for (let i = 0; i < buffer.length; i++) {
+                let pos_float = i*factor;
+                let j = Math.trunc(pos_float);
+                let second_weight = pos_float % 1;
+                let first_weight = 1 - second_weight;
+                let p = j*sample_bytes + channel_offset;
+                let value0 = payload[p] + payload[p+1]*256;
+                p += sample_bytes;
+                let value1 = value0;
+                if (p < payload.length) {
+                    value1 = payload[p] + payload[p+1]*256;
+                }
+                let value = (value0*first_weight + value1*second_weight);
+                channel[i] = (value / 32768.0) - 1.0;
+            }
+        }
+        return buffer;
+    }
+
+    _createBuffer(payload) {
+        let sample_bytes = 2*this._nchannels;
+        let buffer = this._context.createBuffer(
+            this._nchannels, payload.length/sample_bytes, this._sample_rate);
+
+        for (let ch = 0; ch < this._nchannels; ch++) {
+            const channel = buffer.getChannelData(ch);
+            let channel_offset = ch*2;
+            for (let i = 0; i < buffer.length; i++) {
+                let p = i*sample_bytes + channel_offset;
+                let value = payload[p] + payload[p+1]*256;
+                channel[i] = (value / 32768.0) - 1.0;
+            }
+        }
+        return buffer;
+    }
+}
diff --git a/core/rfb.js b/core/rfb.js
index 4f5a51d2..5368e52d 100644
--- a/core/rfb.js
+++ b/core/rfb.js
@@ -14,6 +14,7 @@ import { dragThreshold, supportsWebCodecsH264Decode } from './util/browser.js';
 import { clientToElement } from './util/element.js';
 import { setCapture } from './util/events.js';
 import EventTargetMixin from './util/eventtarget.js';
+import Audio from "./audio.js";
 import Display from "./display.js";
 import Inflator from "./inflator.js";
 import Deflator from "./deflator.js";
@@ -157,10 +158,7 @@ export default class RFB extends EventTargetMixin {
         this._qemuAudioSupported = false;
         this._page_had_user_interaction = false;
         this._audio_enable = false;
-        this._audio_next_start = 0;
-        this._audio_sample_rate = 44100;
-        this._audio_channels = 2;
-        this._audio_context = null;
+        this._audio = new Audio(44100, 2);
 
         this._extendedPointerEventSupported = false;
 
@@ -2697,7 +2695,7 @@ export default class RFB extends EventTargetMixin {
 
             case encodings.pseudoEncodingQEMUAudioEvent:
                 if (!this._qemuAudioSupported) {
-                    RFB.messages.enableQemuAudioUpdates(this._sock, this._audio_channels, this._audio_sample_rate);
+                    RFB.messages.enableQemuAudioUpdates(this._sock, this._audio.nchannels, this._audio.sample_rate);
                     this._qemuAudioSupported = true;
                 }
                 return true;
@@ -2739,16 +2737,11 @@
 
         switch (operation) {
            case 0: {
-                this._audio_context = null;
-                this._audio_next_start = 0;
+                this._audio.stop();
                 return true;
             }
             case 1: {
-                this._audio_context = new AudioContext({
-                    latencyHint: "interactive",
-                    sampleRate: this._audio_sample_rate,
-                });
-                this._audio_next_start = 0;
+                this._audio.start();
                 return true;
             }
             case 2: break;
@@ -2764,47 +2757,21 @@
 
         const length = this._sock.rQshift32();
 
+        if (length === 0) {
+            return false;
+        }
+
         if (this._sock.rQwait("audio payload", length, 8)) {
             return false;
         }
 
-        if (length !== 0) {
-            let payload = this._sock.rQshiftBytes(length, false);
+        let payload = this._sock.rQshiftBytes(length, false);
 
-            if (this._audio_context === null) {
-                return false;
-            }
-
-            let sample_bytes = 2*this._audio_channels;
-            let buffer = this._audio_context.createBuffer(this._audio_channels, length/sample_bytes, this._audio_sample_rate);
-
-            for (let ch = 0; ch < this._audio_channels; ch++) {
-                const channel = buffer.getChannelData(ch);
-                let channel_offset = ch*2;
-                for (let i = 0; i < buffer.length; i++) {
-                    let p = i*sample_bytes + channel_offset;
-                    let value = payload[p] + payload[p+1]*256;
-                    channel[i] = (value / 32768.0) - 1.0;
-                }
-            }
-
-            if (this._page_had_user_interaction && this._audio_enable) {
-                let ctime = this._audio_context.currentTime;
-                if (ctime > this._audio_next_start) {
-                    this._audio_next_start = ctime;
-                }
-                let start_time = this._audio_next_start;
-
-                this._audio_next_start += buffer.duration;
-
-                let source = this._audio_context.createBufferSource();
-                source.buffer = buffer;
-                source.connect(this._audio_context.destination);
-                source.start(start_time);
-            }
+        if (!this._page_had_user_interaction || !this._audio_enable) {
+            return true;
         }
 
-        return true;
+        return this._audio.play(payload);
     }
 
     enable_audio(value) {
@@ -2812,7 +2779,7 @@
         this._audio_enable = value;
         if (this._qemuAudioSupported) {
             if (this._audio_enable) {
-                RFB.messages.enableQemuAudioUpdates(this._sock, this._audio_channels, this._audio_sample_rate);
+                RFB.messages.enableQemuAudioUpdates(this._sock, this._audio.nchannels, this._audio.sample_rate);
             } else {
                 RFB.messages.disableQemuAudioUpdates(this._sock);
             }
@@ -3433,7 +3400,7 @@ RFB.messages = {
         sock.flush();
     },
 
-    disableQemuAudioUpdates(sock, channels, sample_rate) {
+    disableQemuAudioUpdates(sock, nchannels, sample_rate) {
         sock.sQpush8(255); // msg-type
         sock.sQpush8(1); // submessage-type
         sock.sQpush16(1); // disable audio
@@ -3441,13 +3408,13 @@ RFB.messages = {
         sock.flush();
     },
 
-    enableQemuAudioUpdates(sock, channels, sample_rate) {
+    enableQemuAudioUpdates(sock, nchannels, sample_rate) {
         sock.sQpush8(255); // msg-type
         sock.sQpush8(1); // submessage-type
         sock.sQpush16(2); // set sample format
         sock.sQpush8(2); // format U16
-        sock.sQpush8(channels);
+        sock.sQpush8(nchannels);
         sock.sQpush32(sample_rate); // audio frequency
 
         sock.sQpush8(255); // msg-type
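
Note (not part of the patch): a rough sketch of how rfb.js is expected to drive the new Audio
class, mirroring the switch (operation) handling in the QEMU audio message handler above.
The import path and the payload variable are illustrative; the constructor arguments and the
start(), play() and stop() calls are the API introduced by core/audio.js.

    import Audio from './core/audio.js';

    const audio = new Audio(44100, 2);   // sample rate and channel count advertised to QEMU

    audio.start();        // operation 1: server starts the audio stream
    audio.play(payload);  // operation 2: one frame of raw U16 little-endian samples
    audio.stop();         // operation 0: server ends the stream; later frames are dropped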