Real-time transcription with Google Cloud Speech API over gRPC from Electron - javascript

What I want to achieve is the same real-time transcription process as the Web Speech API, but using the Google Cloud Speech API.
The main goal is to transcribe live recordings from an Electron app with the Speech API over the gRPC protocol.
This is a simplified version of what I implemented:
const { desktopCapturer } = window.require('electron');
const speech = require('@google-cloud/speech');

const client = speech.v1({
  projectId: 'my_project_id',
  credentials: {
    client_email: 'my_client_email',
    private_key: 'my_private_key',
  },
});

desktopCapturer.getSources({ types: ['window', 'screen'] }, (error, sources) => {
  navigator.mediaDevices
    .getUserMedia({
      audio: true,
    })
    .then((stream) => {
      let fileReader = new FileReader();
      let arrayBuffer;
      fileReader.onloadend = () => {
        arrayBuffer = fileReader.result;
        let speechStreaming = client
          .streamingRecognize({
            config: {
              encoding: speech.v1.types.RecognitionConfig.AudioEncoding.LINEAR16,
              languageCode: 'en-US',
              sampleRateHertz: 44100,
            },
            singleUtterance: true,
          })
          .on('data', (response) => response);
        speechStreaming.write(arrayBuffer);
      };
      fileReader.readAsArrayBuffer(stream);
    });
});
The error response from the Speech API says that the audio stream is too slow and is not being sent in real time.
I suspect the reason is that I passed the stream without any formatting or proper object initialization, so the streaming recognition cannot be performed.

This official sample project on GitHub appears to match what you're looking for: https://github.com/googleapis/nodejs-speech/blob/master/samples/infiniteStreaming.js
This application demonstrates how to perform infinite streaming using the streamingRecognize operation with the Google Cloud Speech API.
See also my comment for an alternative in Electron using OtterAI's transcription service (it's the approach I'm going to try soon).
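For reference, the core pattern in that sample looks roughly like the following. This is a minimal sketch only, not the full infiniteStreaming.js: the node-record-lpcm16 capture, the restart interval, and the config values are assumptions for illustration, and the real sample also replays buffered audio across restarts so no words are lost.
// Minimal sketch of the infinite-streaming idea: keep feeding LINEAR16 audio to
// streamingRecognize and restart the stream before the API's streaming time limit.
const speech = require('@google-cloud/speech');
const recorder = require('node-record-lpcm16');

const client = new speech.SpeechClient();
const request = {
  config: {
    encoding: 'LINEAR16',
    sampleRateHertz: 16000,
    languageCode: 'en-US',
  },
  interimResults: true,
};

const STREAM_RESTART_MS = 4 * 60 * 1000; // restart safely before the limit
let recognizeStream = null;

function startStream() {
  recognizeStream = client
    .streamingRecognize(request)
    .on('error', console.error)
    .on('data', (data) => {
      const result = data.results[0];
      if (result && result.alternatives[0]) {
        console.log(`Transcript: ${result.alternatives[0].transcript}`);
      }
    });
  // Periodically end the current stream and open a fresh one.
  setTimeout(() => {
    recognizeStream.end();
    startStream();
  }, STREAM_RESTART_MS);
}

startStream();

// Pipe microphone audio into whichever recognize stream is currently active.
recorder
  .record({ sampleRate: 16000, threshold: 0 })
  .stream()
  .on('error', console.error)
  .on('data', (chunk) => recognizeStream.write(chunk));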

You may use the node-record-lpcm16 module to record audio and pipe it directly to a speech recognition system like Google's.
In the repository, there is an example using wit.ai.
For Google speech recognition, you may use something like this:
'use strict'

const { SpeechClient } = require('@google-cloud/speech')
const recorder = require('node-record-lpcm16')

const RECORD_CONFIG = {
  sampleRate: 44100,
  recorder: 'arecord'
}

const RECOGNITION_CONFIG = {
  config: {
    sampleRateHertz: 44100,
    languageCode: 'en-US',
    encoding: 'LINEAR16'
  },
  interimResults: true
}

const client = new SpeechClient(/* YOUR CREDENTIALS */)

// Returns a streaming recognition request; it emits 'data' events with interim and final results
const recognize = () => {
  return client
    .streamingRecognize(RECOGNITION_CONFIG)
    .on('error', err => {
      console.error('Error during recognition: ', err)
    })
    .on('data', data => {
      console.log('Received recognition data: ', data)
    })
}

// Record from the microphone and pipe the raw LINEAR16 audio into the recognizer
const recording = recorder.record(RECORD_CONFIG)
recording
  .stream()
  .on('error', err => {
    console.error('Error during recording: ', err)
  })
  .pipe(recognize())

Related

How to record aux-in audio using Electron?

In an Electron application that I'm working on, I'm trying to record the audio that comes in through the auxiliary input (3.5mm jack) on my computer. I don't want to pick up any audio through the built-in microphone.
I was running the following code in order to record five seconds of audio. It works fine, but instead of recording the aux-in, it records the sound from the microphone.
const fs = require("fs");

function fiveSecondAudioRecorder() {
  navigator.mediaDevices.getUserMedia({ audio: true })
    .then(stream => {
      const mediaRecorder = new MediaRecorder(stream);
      mediaRecorder.start();
      const chunks = [];
      mediaRecorder.addEventListener("dataavailable", event => {
        chunks.push(event.data);
      });
      mediaRecorder.addEventListener("stop", () => {
        const blob = new Blob(chunks, { type: "audio/ogg" });
        blob.arrayBuffer().then(arrayBuffer => {
          const buffer = Buffer.from(arrayBuffer);
          fs.writeFile("recorded-audio.ogg", buffer, error => {
            if (error) {
              console.error("Failed to save recorded audio to disk", error);
            } else {
              console.log("Recorded audio was saved to disk");
            }
          });
        });
      });
      setTimeout(() => {
        mediaRecorder.stop();
      }, 5000);
    })
    .catch(error => {
      console.error("Failed to access microphone", error);
    });
}

fiveSecondAudioRecorder()
I plugged in a 3.5mm jack with an active audio source and expected this to automatically switch the audio input. However, this is not the case and I only manage to pick up the sounds from my microphone. Hence my question: How can I specifically record my aux-in?
Thanks a lot for your help.
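One direction worth checking (an assumption, not from the original thread): getUserMedia({ audio: true }) simply picks the default input device, so you could enumerate the audio inputs and request the aux/line-in capture device explicitly via a deviceId constraint, assuming the OS exposes it as a separate device. A rough sketch with an illustrative label match:
// Sketch: pick a specific capture device instead of the default microphone.
// The /line|aux/ label match is a guess; inspect the labels your machine reports.
// Note: labels may be empty until microphone permission has been granted once.
async function getAuxInStream() {
  const devices = await navigator.mediaDevices.enumerateDevices();
  const auxInput = devices.find(
    (d) => d.kind === "audioinput" && /line|aux/i.test(d.label)
  );
  if (!auxInput) {
    throw new Error("No aux/line-in capture device found");
  }
  // Request exactly that device rather than the default microphone.
  return navigator.mediaDevices.getUserMedia({
    audio: { deviceId: { exact: auxInput.deviceId } },
  });
}
The stream returned by getAuxInStream() could then be fed to the same MediaRecorder code as above.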

How can I convert opus packets to mp3/wav

I created a Discord bot with discord.js v13. I'm having trouble converting the opus packets to other file types, and even the official discord.js examples haven't been updated for v13. I have no idea how to deal with it. Here is part of my code:
async function record(interaction, opts = {}) {
  // Get the voice connection; if there isn't one, create one
  let connection = getVoiceConnection(interaction.guildId);
  if (!connection) {
    if (!interaction.member.voice.channel) return false;
    connection = joinVoice(interaction.member.voice.channel, interaction);
  }
  const memberId = interaction.member.id;
  // Create the stream
  const stream = connection.receiver.subscribe(memberId, {
    end: {
      behavior: EndBehaviorType.Manual
    }
  });
  // Create the file stream
  const writableStream = fs.createWriteStream(`${opts.filename || interaction.guild.name}.${opts.format || 'opus'}`);
  console.log('Created the streams, started recording');
  // todo: set the stream into client and stop it in another function
  return setTimeout(() => {
    console.log('Creating the decoder');
    let decoder = new prism.opus.Decoder();
    console.log('Created');
    stream.destroy();
    console.log('Stopped recording and saving the stream');
    stream
      .pipe(writableStream);
    stream.on('close', () => {
      console.log('Data Stream closed');
    });
    stream.on('error', (e) => {
      console.error(e);
    });
  }, 5000);
}
Try setting frameSize, channels and rate for the Decoder:
const opusDecoder = new prism.opus.Decoder({
  frameSize: 960,
  channels: 2,
  rate: 48000,
})
Also, not sure if it is intended, but you seem to destroy the stream just before you pipe it into the writable stream.
Here is my example that gives stereo 48kHz signed 16-bit PCM stream:
const writeStream = fs.createWriteStream('samples/output.pcm')
const listenStream = connection.receiver.subscribe(userId)
const opusDecoder = new prism.opus.Decoder({
  frameSize: 960,
  channels: 2,
  rate: 48000,
})
listenStream.pipe(opusDecoder).pipe(writeStream)
You can then use Audacity to play the PCM file. Use File -> Import -> Raw Data...
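Since the question asks for mp3/wav, one further step (an assumption on my part, not part of the answer above) would be to convert that raw PCM file with ffmpeg, telling it the sample format explicitly. This assumes ffmpeg is installed and on PATH; the paths are illustrative:
// Sketch: convert the raw PCM produced above into WAV (or MP3) with ffmpeg.
const { spawn } = require('child_process');

function pcmToWav(pcmPath, wavPath) {
  return new Promise((resolve, reject) => {
    const ffmpeg = spawn('ffmpeg', [
      '-f', 's16le',   // raw signed 16-bit little-endian PCM
      '-ar', '48000',  // 48 kHz, matching the Decoder rate above
      '-ac', '2',      // stereo, matching channels: 2
      '-i', pcmPath,
      wavPath,         // use an .mp3 extension here to get MP3 instead
    ]);
    ffmpeg.on('close', (code) =>
      code === 0 ? resolve(wavPath) : reject(new Error(`ffmpeg exited with ${code}`))
    );
  });
}

// pcmToWav('samples/output.pcm', 'samples/output.wav');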

Adding a screen share option to a WebRTC app: unable to see it on other users' side

I was adding screen-share functionality to my app, but it is not working: the shared screen only shows on my side, not for the other users.
Here is the code:
try {
  navigator.mediaDevices
    .getDisplayMedia({
      video: true,
      audio: true
    })
    .then((stream) => {
      const video1 = document.createElement("video");
      video1.controls = true;
      addVideoStream(video1, stream);
      socket.on("user-connected", (userId) => {
        const call = peer.call(userId, stream);
        stream.getVideoTracks()[0].addEventListener("ended", () => {
          video1.remove();
        });
        call.on("close", () => {});
      });
      stream.getVideoTracks()[0].addEventListener("ended", () => {
        video1.remove();
      });
    });
} catch (err) {
  console.log("Error: " + err);
}
The issue is most likely related to signaling, and that depends on each project.
You could start from a working example that streams the webcam/microphone and then switch the source to the screen.
In this HTML5 Live Streaming example, you can switch the source between camera and desktop; the transmission is the same. So you could achieve something similar by starting from an example for camera streaming and testing that first.
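If the call to the other user was already established with the camera stream, one common way to switch the source (an assumption here, since the signaling code isn't shown) is to swap the outgoing video track with RTCRtpSender.replaceTrack on the existing connection; with PeerJS the underlying RTCPeerConnection should be reachable via the media connection (call.peerConnection). A rough sketch:
// Sketch: replace the camera track already being sent with the screen track.
// `peerConnection` is assumed to be the existing RTCPeerConnection to the other user.
async function shareScreen(peerConnection) {
  const screenStream = await navigator.mediaDevices.getDisplayMedia({ video: true });
  const screenTrack = screenStream.getVideoTracks()[0];

  // Find the sender currently transmitting video and swap its track in place;
  // a same-kind replacement does not require renegotiating the call.
  const sender = peerConnection
    .getSenders()
    .find((s) => s.track && s.track.kind === "video");
  await sender.replaceTrack(screenTrack);

  // When the user stops sharing, switch back to the camera (illustrative).
  screenTrack.addEventListener("ended", async () => {
    const camStream = await navigator.mediaDevices.getUserMedia({ video: true });
    await sender.replaceTrack(camStream.getVideoTracks()[0]);
  });
}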

How to generate an ICE candidate?

I'm developing a video conference with WebRTC on a local network, so I use only one signaling server to exchange SDP data. As I understand it, I also need to exchange ICE candidates, but I don't know how to generate them. Thanks.
You can get the generated ICE candidates by setting the peerConnection.onicecandidate event handler.
(async () => {
  const pc = new RTCPeerConnection();
  pc.onicecandidate = evt => {
    console.log(evt.candidate?.candidate);
  };
  const stream = await navigator.mediaDevices.getUserMedia({ video: true });
  stream.getTracks().forEach(track => pc.addTrack(track, stream));
  const offer = await pc.createOffer();
  await pc.setLocalDescription(offer);
})();
Do you create a configuration that contains at least one ICE server URL, and then use that configuration to create your RTCPeerConnection instance? When you set an ICE server URL, the 'icecandidate' event should be fired.
const configuration = {
  iceServers: [{ urls: "stun:stun.l.google.com:19302" }],
};
const pc = new RTCPeerConnection(configuration);
pc.addEventListener('icecandidate', event => {
  if (event.candidate) {
    console.log('icecandidate received: ', event.candidate);
  }
});
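Generating candidates is only half of the job: each candidate your peer connection emits has to be sent to the other peer through your signaling server and added there with addIceCandidate. A minimal sketch, assuming a socket.io-style signaling channel (the socket object and the 'ice-candidate' event name are illustrative):
// Send every locally generated candidate to the other peer via signaling.
pc.addEventListener('icecandidate', (event) => {
  if (event.candidate) {
    socket.emit('ice-candidate', event.candidate);
  }
});

// On the other peer, add every candidate received from signaling.
socket.on('ice-candidate', async (candidate) => {
  try {
    await pc.addIceCandidate(candidate);
  } catch (err) {
    console.error('Failed to add received ICE candidate', err);
  }
});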

Audio stream from cordova-plugin-audioinput to Google Speech API

For a cross-platform app project using the Meteor framework, I'd like to record microphone input and extract speech from it using the Google Speech API.
Following the Google documentation, I'm more specifically trying to build an audio stream to feed the Google Speech client.
On client side, a recording button triggers the following startCapture function (based on cordova audioinput plugin):
export var startCapture = function () {
  try {
    if (window.audioinput && !audioinput.isCapturing()) {
      setTimeout(stopCapture, 20000);
      var captureCfg = {
        sampleRate: 16000,
        bufferSize: 2048,
      };
      audioinput.start(captureCfg);
    }
  }
  catch (e) {
  }
};
audioinput events allow me to get chunks of audio data as it is recorded:
window.addEventListener('audioinput', onAudioInputCapture, false);

var audioDataQueue = [];

function onAudioInputCapture(evt) {
  try {
    if (evt && evt.data) {
      // Push the data to the audio queue (array)
      audioDataQueue.push(evt.data);
      // Here should probably be a call to a Meteor server method?
    }
  }
  catch (e) {
  }
}
I'm struggling to convert the recorded audio data into a ReadableStream that I could pipe to the Google Speech API client on the server side.
const speech = require('@google-cloud/speech');

const client = new speech.SpeechClient();
const request = {
  config: {
    encoding: "LINEAR16",
    sampleRateHertz: 16000,
    languageCode: 'en-US',
  },
  interimResults: true,
};

export const recognizeStream = client
  .streamingRecognize(request)
  .on('error', console.error)
  .on('data', data =>
    console.log(data.results)
  );
I tried the following approach, but it doesn't feel like the right way to proceed:
const Stream = require('stream');

var serverAudioDataQueue = [];

const readable = new Stream.Readable({
  objectMode: true,
});
readable._read = function (n) {
  this.push(audioDataQueue.splice(0, audioDataQueue.length));
};
readable.pipe(recognizeStream);

Meteor.methods({
  'feedGoogleSpeech': function (data) {
    data.forEach(item => serverAudioDataQueue.push(item));
  },
  ...
});
Any insight on this?
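One simpler direction to try (a sketch, not a tested Meteor implementation): streamingRecognize already returns a writable duplex stream, so instead of building a custom Readable you could convert each received chunk to a LINEAR16 Buffer and write it straight to recognizeStream inside the Meteor method. This assumes evt.data arrives as an array of 16-bit PCM samples; check the format the plugin is configured to deliver:
// Server side: write incoming audio chunks straight to the recognize stream.
// `recognizeStream` is the duplex stream returned by client.streamingRecognize(request) above.
Meteor.methods({
  'feedGoogleSpeech': function (samples) {
    // Pack the samples into a little-endian 16-bit (LINEAR16) Buffer.
    // If the plugin delivers normalized floats instead, scale by 0x7FFF before packing.
    const buffer = Buffer.from(Int16Array.from(samples).buffer);
    recognizeStream.write(buffer);
  },
});
On the client, onAudioInputCapture would then call Meteor.call('feedGoogleSpeech', evt.data) (or send batched chunks from audioDataQueue) instead of only queueing them locally.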
