audio: implement re-sync, move code to separate file

Signed-off-by: Dietmar Maurer <dietmar@proxmox.com>
Dietmar Maurer 2025-04-04 11:28:37 +02:00
parent 6f8f4bff62
commit 3721da90dc
2 changed files with 187 additions and 50 deletions

core/audio.js (new file)

@@ -0,0 +1,170 @@
// The RFB protocol (VNC) is designed for real-time user interaction
// and allows transferring audio messages together with screen content.
// It is not possible to use any kind of large buffering, because that
// would introduce long delays between user interaction and content
// display.
//
// This is not really a problem for screen content, because the human
// brain is quite tolerant of slight speed changes in video content,
// and we mostly transfer non-video data anyway.
//
// With audio, the situation is quite different, as it must be played
// at a constant speed. Any delay leads to audible distortion, which is
// unpleasant for humans.
//
// Without buffering, it is always possible for audio frames to arrive
// too late or too early due to changing network speeds.
//
// We use the following algorithm:
//
// - a small jitter buffer (20 ms) to tolerate small speed changes
// - simply discard late audio frames
// - queue early frames with a slight speedup (pitch scale) to re-sync audio
// - if we get too many early frames, skip frames for a fast re-sync
//   (see the example below)
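//
// For example, with the defaults used below (20 ms jitter buffer):
// resampling starts once queued audio runs more than 5 * 20 ms = 100 ms
// ahead of the playback clock, plays 1% faster until the offset drops
// back below 20 ms, and frames arriving more than 0.5 sec early are
// skipped outright.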
//
// ## Audio format
//
// We use/expect raw U16 (unsigned 16-bit, little-endian) audio data
// with interleaved channels.
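//
// For example, the little-endian sample bytes (0x00, 0x80) decode to
// 0x00 + 0x80*256 = 32768, which maps to 32768/32768 - 1.0 = 0.0
// (silence) in the Web Audio float range [-1, 1].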
import * as Log from './util/logging.js';
export default class Audio {
    constructor(sample_rate, nchannels) {
        this._next_start = 0;
        this._context = null;
        this._jitter = 0.02;
        this._resample_trigger = 5*this._jitter;
        this._stable_time = 1.0;

        // ===== PROPERTIES =====
        this._sample_rate = sample_rate;
        this._nchannels = nchannels;
    }

    // ===== PROPERTIES =====

    get sample_rate() { return this._sample_rate; }
    get nchannels() { return this._nchannels; }
    // ===== PUBLIC METHODS =====

    // Stop audio playback
    //
    // Further audio frames are simply dropped.
    stop() {
        this._context = null;
        this._next_start = 0;
    }

    // Start audio playback with a fresh AudioContext
    start() {
        this._context = new AudioContext({
            latencyHint: "interactive",
            sampleRate: this._sample_rate,
        });
        this._next_start = 0;
    }
    // Queue an audio frame ('payload' is raw U16 samples) for playback
    play(payload) {
        if (this._context === null) {
            return true;
        }

        let ctime = this._context.currentTime;
        // how far the queued audio runs ahead of the playback clock
        let time_offset = this._next_start - ctime;
        let sample_bytes = 2*this._nchannels;

        if ((time_offset < this._jitter) && (this._resample_trigger !== 5*this._jitter)) {
            Log.Debug("Stop resampling because audio is in sync (delay = " + time_offset + " sec)");
            this._resample_trigger = 5*this._jitter;
        }

        let buffer = null;
        if (time_offset > this._resample_trigger && (payload.length > (100*sample_bytes))) {
            if (this._resample_trigger !== this._jitter) {
                Log.Debug("Start resampling to re-sync audio (delay = " + time_offset + " sec)");
                this._resample_trigger = this._jitter;
            }
            buffer = this._pitchScale(payload, 1.01); // increase pitch by 1%
        } else {
            buffer = this._createBuffer(payload);
        }

        if (this._next_start > 0) {
            if (time_offset < -buffer.duration) {
                Log.Warn("Skip delayed audio frame (delay = " + (-time_offset) + " sec)");
                this._next_start = ctime + this._jitter;
                return true; // do not play delayed frame - skip it!
            }
            if (time_offset > 0.5) {
                Log.Warn("Skip early audio frame (offset = " + time_offset + " sec)");
                this._stable_time = 0;
                return true; // skip frame for fast re-sync
            }
        }

        this._stable_time += buffer.duration;

        if (this._next_start === 0) {
            this._next_start = ctime + this._jitter;
        }

        let start_time = this._next_start;
        this._next_start += buffer.duration;

        // only play once timing has been stable for at least one second
        if (this._stable_time >= 1.0) {
            let source = this._context.createBufferSource();
            source.buffer = buffer;
            source.connect(this._context.destination);
            source.start(start_time);
        }

        return true;
    }
    // ===== PRIVATE METHODS =====

    // Resample the frame by simple linear interpolation, so that it plays
    // slightly faster (factor > 1) and the queue drains back into sync.
    //
    // see: https://en.wikipedia.org/wiki/Audio_time_stretching_and_pitch_scaling
    _pitchScale(payload, factor) {
        let sample_bytes = 2*this._nchannels;
        let new_length = Math.ceil(payload.length/(factor*sample_bytes));
        let buffer = this._context.createBuffer(this._nchannels, new_length, this._sample_rate);

        for (let ch = 0; ch < this._nchannels; ch++) {
            const channel = buffer.getChannelData(ch);
            let channel_offset = ch*2;
            for (let i = 0; i < buffer.length; i++) {
                let pos_float = i*factor;
                let j = Math.trunc(pos_float);
                let second_weight = pos_float % 1;
                let first_weight = 1 - second_weight;

                // decode the two neighboring U16 little-endian samples
                let p = j*sample_bytes + channel_offset;
                let value0 = payload[p] + payload[p+1]*256;
                p += sample_bytes;
                let value1 = value0;
                if (p < payload.length) {
                    value1 = payload[p] + payload[p+1]*256;
                }
                // interpolate and map [0, 65535] to [-1, 1]
                let value = (value0*first_weight + value1*second_weight);
                channel[i] = (value / 32768.0) - 1.0;
            }
        }
        return buffer;
    }
    // Convert raw U16 interleaved samples into an AudioBuffer
    _createBuffer(payload) {
        let sample_bytes = 2*this._nchannels;
        let buffer = this._context.createBuffer(
            this._nchannels, payload.length/sample_bytes, this._sample_rate);

        for (let ch = 0; ch < this._nchannels; ch++) {
            const channel = buffer.getChannelData(ch);
            let channel_offset = ch*2;
            for (let i = 0; i < buffer.length; i++) {
                let p = i*sample_bytes + channel_offset;
                // decode U16 little-endian and map [0, 65535] to [-1, 1]
                let value = payload[p] + payload[p+1]*256;
                channel[i] = (value / 32768.0) - 1.0;
            }
        }
        return buffer;
    }
}
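
As a minimal usage sketch: the call sequence below mirrors the rfb.js integration that follows, where the QEMU audio message operations map to stop (0), start (1), and audio data (2); `payload` stands in for the Uint8Array read from the socket.

    import Audio from "./audio.js";

    let audio = new Audio(44100, 2); // 44.1 kHz, stereo (the rfb.js defaults)
    audio.start();                   // operation 1: begin playback
    audio.play(payload);             // operation 2: queue one frame of raw U16 samples
    audio.stop();                    // operation 0: stop; further frames are dropped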

core/rfb.js

@@ -14,6 +14,7 @@ import { dragThreshold, supportsWebCodecsH264Decode } from './util/browser.js';
 import { clientToElement } from './util/element.js';
 import { setCapture } from './util/events.js';
 import EventTargetMixin from './util/eventtarget.js';
+import Audio from "./audio.js";
 import Display from "./display.js";
 import Inflator from "./inflator.js";
 import Deflator from "./deflator.js";
@@ -157,10 +158,7 @@ export default class RFB extends EventTargetMixin {
         this._qemuAudioSupported = false;
         this._page_had_user_interaction = false;
         this._audio_enable = false;
-        this._audio_next_start = 0;
-        this._audio_sample_rate = 44100;
-        this._audio_channels = 2;
-        this._audio_context = null;
+        this._audio = new Audio(44100, 2);

         this._extendedPointerEventSupported = false;
@@ -2697,7 +2695,7 @@ export default class RFB extends EventTargetMixin {
             case encodings.pseudoEncodingQEMUAudioEvent:
                 if (!this._qemuAudioSupported) {
-                    RFB.messages.enableQemuAudioUpdates(this._sock, this._audio_channels, this._audio_sample_rate);
+                    RFB.messages.enableQemuAudioUpdates(this._sock, this._audio.nchannels, this._audio.sample_rate);
                     this._qemuAudioSupported = true;
                 }
                 return true;
@@ -2739,16 +2737,11 @@ export default class RFB extends EventTargetMixin {
         switch (operation) {
             case 0: {
-                this._audio_context = null;
-                this._audio_next_start = 0;
+                this._audio.stop();
                 return true;
             }
             case 1: {
-                this._audio_context = new AudioContext({
-                    latencyHint: "interactive",
-                    sampleRate: this._audio_sample_rate,
-                });
-                this._audio_next_start = 0;
+                this._audio.start();
                 return true;
             }
             case 2: break;
@@ -2764,47 +2757,21 @@ export default class RFB extends EventTargetMixin {
         const length = this._sock.rQshift32();
-        if (length === 0) {
-            return false;
-        }
         if (this._sock.rQwait("audio payload", length, 8)) {
             return false;
         }
-        if (length !== 0) {
-            let payload = this._sock.rQshiftBytes(length, false);
-            if (this._audio_context === null) {
-                return false;
-            }
-            let sample_bytes = 2*this._audio_channels;
-            let buffer = this._audio_context.createBuffer(this._audio_channels, length/sample_bytes, this._audio_sample_rate);
-            for (let ch = 0; ch < this._audio_channels; ch++) {
-                const channel = buffer.getChannelData(ch);
-                let channel_offset = ch*2;
-                for (let i = 0; i < buffer.length; i++) {
-                    let p = i*sample_bytes + channel_offset;
-                    let value = payload[p] + payload[p+1]*256;
-                    channel[i] = (value / 32768.0) - 1.0;
-                }
-            }
-            if (this._page_had_user_interaction && this._audio_enable) {
-                let ctime = this._audio_context.currentTime;
-                if (ctime > this._audio_next_start) {
-                    this._audio_next_start = ctime;
-                }
-                let start_time = this._audio_next_start;
-                this._audio_next_start += buffer.duration;
-                let source = this._audio_context.createBufferSource();
-                source.buffer = buffer;
-                source.connect(this._audio_context.destination);
-                source.start(start_time);
-            }
-        }
-        return true;
+        let payload = this._sock.rQshiftBytes(length, false);
+        if (!this._page_had_user_interaction || !this._audio_enable) {
+            return true;
+        }
+        return this._audio.play(payload);
     }

     enable_audio(value) {
@@ -2812,7 +2779,7 @@ export default class RFB extends EventTargetMixin {
         this._audio_enable = value;
         if (this._qemuAudioSupported) {
             if (this._audio_enable) {
-                RFB.messages.enableQemuAudioUpdates(this._sock, this._audio_channels, this._audio_sample_rate);
+                RFB.messages.enableQemuAudioUpdates(this._sock, this._audio.nchannels, this._audio.sample_rate);
             } else {
                 RFB.messages.disableQemuAudioUpdates(this._sock);
             }
@@ -3433,7 +3400,7 @@ RFB.messages = {
        sock.flush();
    },
-   disableQemuAudioUpdates(sock, channels, sample_rate) {
+   disableQemuAudioUpdates(sock, nchannels, sample_rate) {
        sock.sQpush8(255); // msg-type
        sock.sQpush8(1); // submessage-type
        sock.sQpush16(1); // disable audio
@@ -3441,13 +3408,13 @@ RFB.messages = {
        sock.flush();
    },
-   enableQemuAudioUpdates(sock, channels, sample_rate) {
+   enableQemuAudioUpdates(sock, nchannels, sample_rate) {
        sock.sQpush8(255); // msg-type
        sock.sQpush8(1); // submessage-type
        sock.sQpush16(2); // set sample format
        sock.sQpush8(2); // format U16
-       sock.sQpush8(channels);
+       sock.sQpush8(nchannels);
        sock.sQpush32(sample_rate); // audio frequency
        sock.sQpush8(255); // msg-type