From 3721da90dccde88f472a568e6b931558e146e679 Mon Sep 17 00:00:00 2001
From: Dietmar Maurer
Date: Fri, 4 Apr 2025 11:28:37 +0200
Subject: [PATCH] audio: implement re-sync, move code to separate file

Signed-off-by: Dietmar Maurer
---
 core/audio.js | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++
 core/rfb.js   |  67 +++++---------
 2 files changed, 187 insertions(+), 50 deletions(-)
 create mode 100644 core/audio.js

diff --git a/core/audio.js b/core/audio.js
new file mode 100644
index 00000000..78992caa
--- /dev/null
+++ b/core/audio.js
@@ -0,0 +1,170 @@
+// The RFB protocol (VNC) is designed for real-time user interactions
+// and allows transferring audio messages together with screen content.
+// It is not possible to use any kind of buffering, because that would
+// introduce large delays between user interaction and content display.
+//
+// This is not really a problem with screen content, because the human
+// brain is quite tolerant of slight speed changes in video content,
+// and we mostly transfer non-video data anyway.
+//
+// With audio, the situation is quite different, as it must be played
+// at a constant speed. Any delay leads to audio distortion, which is
+// unpleasant for humans.
+//
+// Without buffering, it is always possible for audio frames to arrive
+// too late or too early due to changing network speeds.
+//
+// We use the following algorithm:
+//
+// - a small jitter buffer to tolerate small speed changes (20 ms)
+// - simply discard late audio frames
+// - queue early frames with a slight speedup (pitch scaling) to re-sync audio
+// - if we get too many early frames, skip frames for a fast re-sync
+//
+// ## Audio format
+//
+// We use/expect U16 raw audio data.
+
+import * as Log from './util/logging.js';
+
+export default class Audio {
+    constructor(sample_rate, nchannels) {
+        this._next_start = 0;
+        this._context = null;
+        this._jitter = 0.02;
+        this._resample_trigger = 5*this._jitter;
+        this._stable_time = 1.0;
+
+        // ===== PROPERTIES =====
+        this._sample_rate = sample_rate;
+        this._nchannels = nchannels;
+    }
+
+    // ===== PROPERTIES =====
+    get sample_rate() { return this._sample_rate; }
+    get nchannels() { return this._nchannels; }
+
+    // ===== PUBLIC METHODS =====
+
+    // Stop audio playback
+    //
+    // Further audio frames are simply dropped.
+    stop() {
+        this._context = null;
+        this._next_start = 0;
+    }
+
+    start() {
+        this._context = new AudioContext({
+            latencyHint: "interactive",
+            sampleRate: this._sample_rate,
+        });
+        this._next_start = 0;
+    }
+
+    play(payload) {
+        if (this._context === null) {
+            return true;
+        }
+
+        let ctime = this._context.currentTime;
+
+        let time_offset = this._next_start - ctime;
+
+        let sample_bytes = 2*this._nchannels;
+
+        if ((time_offset < this._jitter) && (this._resample_trigger !== 5*this._jitter)) {
+            Log.Debug("Stop resampling because audio is in sync (delay = " + time_offset + " sec)");
+            this._resample_trigger = 5*this._jitter;
+        }
+
+        let buffer = null;
+        if (time_offset > this._resample_trigger && (payload.length > (100*sample_bytes))) {
+            if (this._resample_trigger !== this._jitter) {
+                Log.Debug("Start resampling to re-sync audio (delay = " + time_offset + " sec)");
+                this._resample_trigger = this._jitter;
+            }
+            buffer = this._pitchScale(payload, 1.01); // increase pitch by 1%
+        } else {
+            buffer = this._createBuffer(payload);
+        }
+
+        if (this._next_start > 0) {
+            if (time_offset < -buffer.duration) {
+                Log.Warn("Skip delayed audio frame (delay = " + (-time_offset) + " sec)");
+                this._next_start = ctime + this._jitter;
+                return true; // do not play delayed frame - skip it!
+            }
+            if (time_offset > 0.5) {
+                Log.Warn("Move fast audio frame (offset = " + time_offset + " sec)");
+                this._stable_time = 0;
+                return true; // skip frame.
+            }
+        }
+
+        this._stable_time += buffer.duration;
+
+        if (this._next_start === 0) {
+            this._next_start = ctime + this._jitter;
+        }
+
+        let start_time = this._next_start;
+        this._next_start += buffer.duration;
+
+        if (this._stable_time >= 1.0) {
+            let source = this._context.createBufferSource();
+            source.buffer = buffer;
+            source.connect(this._context.destination);
+            source.start(start_time);
+        }
+
+        return true;
+    }
+
+    // ===== PRIVATE METHODS =====
+
+    // see: https://en.wikipedia.org/wiki/Audio_time_stretching_and_pitch_scaling
+    _pitchScale(payload, factor) {
+        let sample_bytes = 2*this._nchannels;
+        let new_length = Math.ceil(payload.length/(factor*sample_bytes));
+
+        let buffer = this._context.createBuffer(this._nchannels, new_length, this._sample_rate);
+        for (let ch = 0; ch < this._nchannels; ch++) {
+            const channel = buffer.getChannelData(ch);
+            let channel_offset = ch*2;
+            for (let i = 0; i < buffer.length; i++) {
+                let pos_float = i*factor;
+                let j = Math.trunc(pos_float);
+                let second_weight = pos_float % 1;
+                let first_weight = 1 - second_weight;
+                let p = j*sample_bytes + channel_offset;
+                let value0 = payload[p] + payload[p+1]*256;
+                p += sample_bytes;
+                let value1 = value0;
+                if (p < payload.length) {
+                    value1 = payload[p] + payload[p+1]*256;
+                }
+                let value = (value0*first_weight + value1*second_weight);
+                channel[i] = (value / 32768.0) - 1.0;
+            }
+        }
+        return buffer;
+    }
+
+    _createBuffer(payload) {
+        let sample_bytes = 2*this._nchannels;
+        let buffer = this._context.createBuffer(
+            this._nchannels, payload.length/sample_bytes, this._sample_rate);
+
+        for (let ch = 0; ch < this._nchannels; ch++) {
+            const channel = buffer.getChannelData(ch);
+            let channel_offset = ch*2;
+            for (let i = 0; i < buffer.length; i++) {
+                let p = i*sample_bytes + channel_offset;
+                let value = payload[p] + payload[p+1]*256;
+                channel[i] = (value / 32768.0) - 1.0;
+            }
+        }
+        return buffer;
+    }
+}
diff --git a/core/rfb.js b/core/rfb.js
index 4f5a51d2..5368e52d 100644
--- a/core/rfb.js
+++ b/core/rfb.js
@@ -14,6 +14,7 @@ import { dragThreshold, supportsWebCodecsH264Decode } from './util/browser.js';
 import { clientToElement } from './util/element.js';
 import { setCapture } from './util/events.js';
 import EventTargetMixin from './util/eventtarget.js';
+import Audio from "./audio.js";
 import Display from "./display.js";
 import Inflator from "./inflator.js";
 import Deflator from "./deflator.js";
@@ -157,10 +158,7 @@ export default class RFB extends EventTargetMixin {
         this._qemuAudioSupported = false;
         this._page_had_user_interaction = false;
         this._audio_enable = false;
-        this._audio_next_start = 0;
-        this._audio_sample_rate = 44100;
-        this._audio_channels = 2;
-        this._audio_context = null;
+        this._audio = new Audio(44100, 2);
 
         this._extendedPointerEventSupported = false;
 
@@ -2697,7 +2695,7 @@ export default class RFB extends EventTargetMixin {
 
             case encodings.pseudoEncodingQEMUAudioEvent:
                 if (!this._qemuAudioSupported) {
-                    RFB.messages.enableQemuAudioUpdates(this._sock, this._audio_channels, this._audio_sample_rate);
+                    RFB.messages.enableQemuAudioUpdates(this._sock, this._audio.nchannels, this._audio.sample_rate);
                     this._qemuAudioSupported = true;
                 }
                 return true;
@@ -2739,16 +2737,11 @@
 
         switch (operation) {
            case 0: {
-                this._audio_context = null;
-                this._audio_next_start = 0;
+                this._audio.stop();
                 return true;
             }
             case 1: {
-                this._audio_context = new AudioContext({
-                    latencyHint: "interactive",
-                    sampleRate: this._audio_sample_rate,
-                });
-                this._audio_next_start = 0;
+                this._audio.start();
                 return true;
             }
             case 2: break;
@@ -2764,47 +2757,21 @@
 
         const length = this._sock.rQshift32();
 
+        if (length === 0) {
+            return false;
+        }
+
         if (this._sock.rQwait("audio payload", length, 8)) {
             return false;
         }
 
-        if (length !== 0) {
-            let payload = this._sock.rQshiftBytes(length, false);
+        let payload = this._sock.rQshiftBytes(length, false);
 
-            if (this._audio_context === null) {
-                return false;
-            }
-
-            let sample_bytes = 2*this._audio_channels;
-            let buffer = this._audio_context.createBuffer(this._audio_channels, length/sample_bytes, this._audio_sample_rate);
-
-            for (let ch = 0; ch < this._audio_channels; ch++) {
-                const channel = buffer.getChannelData(ch);
-                let channel_offset = ch*2;
-                for (let i = 0; i < buffer.length; i++) {
-                    let p = i*sample_bytes + channel_offset;
-                    let value = payload[p] + payload[p+1]*256;
-                    channel[i] = (value / 32768.0) - 1.0;
-                }
-            }
-
-            if (this._page_had_user_interaction && this._audio_enable) {
-                let ctime = this._audio_context.currentTime;
-                if (ctime > this._audio_next_start) {
-                    this._audio_next_start = ctime;
-                }
-                let start_time = this._audio_next_start;
-
-                this._audio_next_start += buffer.duration;
-
-                let source = this._audio_context.createBufferSource();
-                source.buffer = buffer;
-                source.connect(this._audio_context.destination);
-                source.start(start_time);
-            }
+        if (!this._page_had_user_interaction || !this._audio_enable) {
+            return true;
         }
 
-        return true;
+        return this._audio.play(payload);
     }
 
     enable_audio(value) {
@@ -2812,7 +2779,7 @@
         this._audio_enable = value;
         if (this._qemuAudioSupported) {
             if (this._audio_enable) {
-                RFB.messages.enableQemuAudioUpdates(this._sock, this._audio_channels, this._audio_sample_rate);
+                RFB.messages.enableQemuAudioUpdates(this._sock, this._audio.nchannels, this._audio.sample_rate);
             } else {
                 RFB.messages.disableQemuAudioUpdates(this._sock);
             }
@@ -3433,7 +3400,7 @@ RFB.messages = {
         sock.flush();
     },
 
-    disableQemuAudioUpdates(sock, channels, sample_rate) {
+    disableQemuAudioUpdates(sock, nchannels, sample_rate) {
         sock.sQpush8(255); // msg-type
         sock.sQpush8(1); // submessage-type
         sock.sQpush16(1); // disable audio
@@ -3441,13 +3408,13 @@ RFB.messages = {
         sock.flush();
     },
 
-    enableQemuAudioUpdates(sock, channels, sample_rate) {
+    enableQemuAudioUpdates(sock, nchannels, sample_rate) {
         sock.sQpush8(255); // msg-type
         sock.sQpush8(1); // submessage-type
         sock.sQpush16(2); // set sample format
         sock.sQpush8(2); // format U16
-        sock.sQpush8(channels);
+        sock.sQpush8(nchannels);
         sock.sQpush32(sample_rate); // audio frequency
 
         sock.sQpush8(255); // msg-type
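
Note (not part of the patch): a rough sketch of how rfb.js is expected to drive the new Audio
class, mirroring the switch (operation) handling in the QEMU audio message handler above.
The import path and the payload variable are illustrative; the constructor arguments and the
start(), play() and stop() calls are the API introduced by core/audio.js.

    import Audio from './core/audio.js';

    const audio = new Audio(44100, 2);   // sample rate and channel count advertised to QEMU

    audio.start();        // operation 1: server starts the audio stream
    audio.play(payload);  // operation 2: one frame of raw U16 little-endian samples
    audio.stop();         // operation 0: server ends the stream; later frames are dropped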