====== Automatic Speech Recognition (ASR) with vosk ======

===== Sources =====

==== vosk ====

  * https://
  * https://

==== Dataquest ====

  * https://

{{youtube>
\\

===== Installation =====

<code>
conda create -n vosk python=3.9
conda activate vosk
conda install -c conda-forge jupyterlab numpy matplotlib pandas
#conda install -c conda-forge ipywidgets
#conda install -c conda-forge scipy scikit-learn
</code>

<code>
pip install vosk
pip install pyaudio
</code>

On Windows the vosk models are **cached here**: ''

==== ffmpeg ====

  * https://

===== pyaudio: Find the right audio device index of your favorite microphone =====

<code python>
import pyaudio
import wave

# Constants for audio recording
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
RECORD_SECONDS = 5  # Adjust this to change the duration of the recording
OUTPUT_FILENAME = "recorded_audio.wav"  # WAV file the test recording is written to

def list_audio_devices():
    """Return a list of 'index: name' strings, one per audio device."""
    audio = pyaudio.PyAudio()
    devices = []

    for i in range(audio.get_device_count()):
        device_info = audio.get_device_info_by_index(i)
        devices.append(f"{i}: {device_info['name']}")

    audio.terminate()
    return devices

def get_input_device_index():
    """List all audio devices and ask the user for the index of the microphone."""
    devices = list_audio_devices()

    print("Available audio devices:")
    for device in devices:
        print(device)

    while True:
        try:
            print("")
            print("Look for your microphone in the list above.")
            print("Note the index shown in front of its name.")
            print("")
            device_index = int(input("Enter the index of your input device (microphone): "))
            if 0 <= device_index < len(devices):
                return device_index
            else:
                print("Invalid device index. Please try again.")
        except ValueError:
            print("Please enter a valid integer.")

def record_audio(device_index):
    """Record RECORD_SECONDS of audio from the selected device and save it as a WAV file."""
    audio = pyaudio.PyAudio()

    # Open a microphone stream with the selected input device
    stream = audio.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True, input_device_index=device_index,
                        frames_per_buffer=CHUNK)

    print(f"Recording {RECORD_SECONDS} seconds from device {device_index} ...")

    frames = []

    # Record audio in chunks and store it in frames
    for _ in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)

    print("Recording finished.")

    # Stop and close the microphone stream
    stream.stop_stream()
    stream.close()
    audio.terminate()

    # Save the recorded audio to a WAV file
    with wave.open(OUTPUT_FILENAME, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(audio.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))

if __name__ == "__main__":
    device_index = get_input_device_index()
    record_audio(device_index)
</code>