// @flow

import { createRnnoiseProcessorPromise, getSampleLength } from '../rnnoise/';
import EventEmitter from 'events';
import JitsiMeetJS from '../base/lib-jitsi-meet';
import logger from './logger';
import { VAD_SCORE_PUBLISHED } from './VADEvents';

/**
 * The structure used by TrackVADEmitter to relay a score.
 */
export type VADScore = {

    /**
     * Device ID associated with the VAD score.
     */
    deviceId: string,

    /**
     * The VAD score of the PCM sample, ranging from 0 to 1 (e.g. 0.60).
     */
    score: number,

    /**
     * Epoch time at which the PCM sample was recorded.
     */
    timestamp: number
};

/**
 * Connects an audio JitsiLocalTrack to a RnnoiseProcessor using a WebAudio ScriptProcessorNode.
 * Once an object is created, audio from the local track flows through the ScriptProcessorNode as raw PCM.
 * The PCM is processed by the rnnoise module and a VAD (voice activity detection) score is obtained; the
 * score is published to consumers via an EventEmitter.
 * After work with this service is done, the destroy method needs to be called for a proper cleanup.
 */
export default class TrackVADEmitter extends EventEmitter {
    /**
     * The AudioContext instance.
     */
    _audioContext: AudioContext;

    /**
     * The MediaStreamAudioSourceNode instance.
     */
    _audioSource: MediaStreamAudioSourceNode;

    /**
     * The ScriptProcessorNode instance.
     */
    _audioProcessingNode: ScriptProcessorNode;

    /**
     * Buffer holding the residue PCM left over after a ScriptProcessorNode callback.
     */
    _bufferResidue: Float32Array;

    /**
     * State flag, checks whether the instance was destroyed.
     */
    _destroyed: boolean = false;

    /**
     * The JitsiLocalTrack instance.
     */
    _localTrack: Object;

    /**
     * Device ID of the target microphone.
     */
    _micDeviceId: string;

    /**
     * Callback function that will be called by the ScriptProcessorNode with raw PCM data, depending on the set
     * sample rate.
     */
    _onAudioProcess: (audioEvent: Object) => void;

    /**
     * Sample rate of the ScriptProcessorNode.
     */
    _procNodeSampleRate: number;

    /**
     * Rnnoise adapter that allows us to calculate a VAD score for PCM samples.
     */
    _rnnoiseProcessor: Object;

    /**
     * PCM sample size expected by the RnnoiseProcessor instance.
     */
    _rnnoiseSampleSize: number;

    /**
     * Constructor.
     *
     * @param {number} procNodeSampleRate - Sample rate of the ScriptProcessorNode. Possible values: 256, 512, 1024,
     * 2048, 4096, 8192, 16384. Passing other values will default to the closest neighbor.
     * @param {Object} rnnoiseProcessor - Rnnoise adapter that allows us to calculate a VAD score
     * for PCM samples.
     * @param {Object} jitsiLocalTrack - JitsiLocalTrack corresponding to micDeviceId.
     */
    constructor(procNodeSampleRate: number, rnnoiseProcessor: Object, jitsiLocalTrack: Object) {
        super();
        this._procNodeSampleRate = procNodeSampleRate;
        this._rnnoiseProcessor = rnnoiseProcessor;
        this._localTrack = jitsiLocalTrack;
        this._micDeviceId = jitsiLocalTrack.getDeviceId();
        this._bufferResidue = new Float32Array([]);
        this._audioContext = new AudioContext();
        this._rnnoiseSampleSize = getSampleLength();

        this._onAudioProcess = this._onAudioProcess.bind(this);

        this._initializeAudioContext();
        this._connectAudioGraph();

        logger.log(`Constructed VAD emitter for device: ${this._micDeviceId}`);
    }

    /**
     * Factory method that sets up all the necessary components for the creation of the TrackVADEmitter.
     *
     * @param {string} micDeviceId - Target microphone device id.
     * @param {number} procNodeSampleRate - Sample rate of the proc node.
     * @returns {Promise} - Promise resolving in a new instance of TrackVADEmitter.
     */
    static async create(micDeviceId: string, procNodeSampleRate: number) {
        let rnnoiseProcessor = null;
        let localTrack = null;

        try {
            logger.log(`Initializing TrackVADEmitter for device: ${micDeviceId}`);

            rnnoiseProcessor = await createRnnoiseProcessorPromise();
            localTrack = await JitsiMeetJS.createLocalTracks({
                devices: [ 'audio' ],
                micDeviceId
            });

            // We only expect one audio track when specifying a device id.
            if (!localTrack[0]) {
                throw new Error(`Failed to create jitsi local track for device id: ${micDeviceId}`);
            }

            return new TrackVADEmitter(procNodeSampleRate, rnnoiseProcessor, localTrack[0]);
        } catch (error) {
            logger.error(`Failed to create TrackVADEmitter for ${micDeviceId} with error: ${error}`);

            if (rnnoiseProcessor) {
                rnnoiseProcessor.destroy();
            }

            // createLocalTracks resolves with an array of tracks, so stop the first (and only) one.
            if (localTrack && localTrack[0]) {
                localTrack[0].stopStream();
            }

            throw error;
        }
    }

    /**
     * Sets up the audio graph in the AudioContext.
     *
     * @returns {void}
     */
    _initializeAudioContext() {
        this._audioSource = this._audioContext.createMediaStreamSource(this._localTrack.stream);

        // TODO: ScriptProcessorNode is deprecated, check and replace with an alternative such as AudioWorklet.
        // We don't need stereo for determining the VAD score, so we create a single channel processing node.
        this._audioProcessingNode = this._audioContext.createScriptProcessor(this._procNodeSampleRate, 1, 1);
        this._audioProcessingNode.onaudioprocess = this._onAudioProcess;
    }

    /**
     * ScriptProcessorNode callback; the input parameter contains the PCM audio that is then sent to rnnoise.
     * Rnnoise only accepts PCM frames of a fixed size (480 samples), whereas the WebAudio processor node can't
     * be configured with a buffer size that is a multiple of 480. Thus, after each _onAudioProcess callback a PCM
     * buffer residue remains (the samples left over after slicing the input into 480-sample frames), which is
     * prepended to the input of the next callback, and so on.
     *
     * @param {AudioProcessingEvent} audioEvent - Audio event.
     * @returns {void}
     */
    _onAudioProcess(audioEvent: Object) {
        // Prepend the residue PCM buffer from the previous process callback.
        const inData = audioEvent.inputBuffer.getChannelData(0);
        const completeInData = [ ...this._bufferResidue, ...inData ];
        const sampleTimestamp = Date.now();

        let i = 0;

        for (; i + this._rnnoiseSampleSize < completeInData.length; i += this._rnnoiseSampleSize) {
            const pcmSample = completeInData.slice(i, i + this._rnnoiseSampleSize);
            const vadScore = this._rnnoiseProcessor.calculateAudioFrameVAD(pcmSample);

            this.emit(VAD_SCORE_PUBLISHED, {
                timestamp: sampleTimestamp,
                score: vadScore,
                deviceId: this._micDeviceId
            });
        }

        this._bufferResidue = completeInData.slice(i, completeInData.length);
    }

    /**
     * Connects the nodes in the AudioContext to start the flow of audio data.
     *
     * @returns {void}
     */
    _connectAudioGraph() {
        this._audioSource.connect(this._audioProcessingNode);
        this._audioProcessingNode.connect(this._audioContext.destination);
    }

    /**
     * Disconnects the nodes in the AudioContext.
     *
     * @returns {void}
     */
    _disconnectAudioGraph() {
        // Even though we disconnect the processing node, it seems that some callbacks remain queued,
        // resulting in calls with an uninitialized context.
        // eslint-disable-next-line no-empty-function
        this._audioProcessingNode.onaudioprocess = () => {};
        this._audioProcessingNode.disconnect();
        this._audioSource.disconnect();
    }

    /**
     * Cleanup potentially acquired resources.
     *
     * @returns {void}
     */
    _cleanupResources() {
        logger.debug(`Cleaning up resources for device ${this._micDeviceId}!`);

        this._disconnectAudioGraph();
        this._localTrack.stopStream();
        this._rnnoiseProcessor.destroy();
    }

    /**
     * Destroy TrackVADEmitter instance (release resources and stop callbacks).
     *
     * @returns {void}
     */
    destroy() {
        if (this._destroyed) {
            return;
        }

        logger.log(`Destroying TrackVADEmitter for mic: ${this._micDeviceId}`);
        this._cleanupResources();
        this._destroyed = true;
    }
}
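
/*
 * Usage sketch (comment only, not exported): a minimal, hedged illustration of how a consumer might drive the
 * emitter, using only the public API defined above. The device id 'default' and the 4096 buffer size are
 * illustrative assumptions, not values mandated by this module.
 *
 *     import TrackVADEmitter from './TrackVADEmitter';
 *     import { VAD_SCORE_PUBLISHED } from './VADEvents';
 *
 *     TrackVADEmitter.create('default', 4096)
 *         .then(emitter => {
 *             // Each published event carries a VADScore: { deviceId, score, timestamp }.
 *             emitter.on(VAD_SCORE_PUBLISHED, vadScore =>
 *                 console.log(`VAD ${vadScore.score} from ${vadScore.deviceId} at ${vadScore.timestamp}`));
 *
 *             // Release the local track, audio graph and rnnoise context once scores are no longer needed.
 *             setTimeout(() => emitter.destroy(), 10000);
 *         })
 *         .catch(error => console.error('Could not start VAD emitter', error));
 */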