====== Automatic Speech Recognition (ASR) with vosk ======
===== Sources =====
==== vosk ====
* https://alphacephei.com/vosk/
* https://github.com/alphacep/vosk-api
==== Dataquest ====
* https://github.com/dataquestio/project-walkthroughs/blob/master/microphone/microphone.ipynb
{{youtube>2kSPbH4jWME?}}
\\
===== Installation =====
conda create -n vosk python=3.9
conda activate vosk
conda install -c conda-forge jupyterlab numpy matplotlib pandas
#conda install -c conda-forge ipywidgets
#conda install -c conda-forge scipy scikit-learn
pip install vosk
pip install pyaudio
On Windows, the vosk models are **cached here**: ''%%C:\Users\<username>\.cache\vosk%%''
==== ffmpeg ====
* https://www.gyan.dev/ffmpeg/builds/
===== pyaudio: Find the right audio device index of your favorite microphone =====
import pyaudio
import wave
# Constants for audio recording (shared by record_audio() further down)
FORMAT = pyaudio.paInt16  # sample format passed to PyAudio when opening the stream
CHANNELS = 1  # number of channels to capture; also written to the WAV header
RATE = 44100  # sampling rate passed to the stream and written to the WAV header
CHUNK = 1024  # frames per buffer, and frames requested per stream.read() call
RECORD_SECONDS = 5 # Adjust this to change the duration of the recording
OUTPUT_FILENAME = "output.wav"  # path of the WAV file the recording is saved to
def list_audio_devices():
    """Return a label for every audio device PyAudio reports.

    Each label has the form ``"<index>: <device name>"``, where the index
    is the one PyAudio uses to address the device. No filtering is applied,
    so the list covers every reported device, not only those that can
    record.

    Returns:
        list[str]: One label per device, ordered by device index.
    """
    audio = pyaudio.PyAudio()
    try:
        return [
            f"{i}: {audio.get_device_info_by_index(i)['name']}"
            for i in range(audio.get_device_count())
        ]
    finally:
        # Release the PyAudio instance even if a device query fails.
        audio.terminate()
def get_input_device_index():
    """Interactively ask the user which audio device to record from.

    Prints the labels returned by ``list_audio_devices()`` once, then keeps
    prompting until the user types an integer that is a valid position in
    that list.

    Returns:
        int: The chosen device index.
    """
    available = list_audio_devices()
    print("Available audio input devices:")
    for entry in available:
        print(entry)

    # Hint shown before every prompt, surrounded by blank lines.
    hint_lines = (
        "",
        "On Becker's Dell Lat. 7330 the following works:",
        "1: Microphone Array (Realtek(R) Au",
        "",
    )
    rejection = "Invalid input. Please enter a valid device index."

    while True:
        for line in hint_lines:
            print(line)
        answer = input("Enter the index of the desired input device: ")
        try:
            choice = int(answer)
        except ValueError:
            # Not a number at all: complain and ask again.
            print(rejection)
            continue
        if 0 <= choice < len(available):
            return choice
        # A number, but outside the range of listed devices.
        print(rejection)
def record_audio(device_index):
    """Record from one input device and save the result as a WAV file.

    Captures ``RECORD_SECONDS`` seconds of audio using the module-level
    ``FORMAT``, ``CHANNELS``, ``RATE`` and ``CHUNK`` settings and writes it
    to ``OUTPUT_FILENAME``.

    Args:
        device_index: PyAudio index of the input device to record from.
    """
    audio = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive,
        # rather than after terminate().
        sample_width = audio.get_sample_size(FORMAT)

        # Open a microphone stream with the selected input device
        stream = audio.open(format=FORMAT, channels=CHANNELS,
                            rate=RATE, input=True,
                            input_device_index=device_index,
                            frames_per_buffer=CHUNK)
        try:
            print(f"Recording from: {audio.get_device_info_by_index(device_index)['name']}")

            # Number of CHUNK-sized reads needed to cover RECORD_SECONDS.
            chunk_count = int(RATE / CHUNK * RECORD_SECONDS)
            frames = [stream.read(CHUNK) for _ in range(chunk_count)]

            print("Finished recording.")
        finally:
            # Stop and close the microphone stream, even if a read failed.
            stream.stop_stream()
            stream.close()
    finally:
        # Always release the PyAudio instance, including when opening fails.
        audio.terminate()

    # Save the recorded audio to a WAV file
    with wave.open(OUTPUT_FILENAME, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
if __name__ == "__main__":
    # Script entry point: let the user pick a device, then record from it.
    record_audio(get_input_device_index())