Published on 2024.09.20 by Askar Aitzhan
In this tutorial, we'll walk you through the process of creating a voice assistant using three powerful AI technologies: Whisper for speech recognition, an LLM for conversation, and a TTS model for speech synthesis.
All the models are available on DeepInfra, but we will use OpenAI's Python client to interact with the LLM and ElevenLabs' Python client to interact with the TTS model.
Before we begin, make sure you have the following installed and set up:
# Create and activate an isolated virtual environment for the tutorial.
python3 -m venv .venv
source .venv/bin/activate
# Install PortAudio with Homebrew (presumably the native audio library PyAudio needs;
# use your platform's package manager if you are not on Homebrew).
brew install portaudio
# Install the Python packages for this tutorial.
pip install openai elevenlabs pyaudio numpy deepinfra scipy requests
You'll also need to set up an API key for DeepInfra.
First, let's use Whisper to transcribe user speech:
import pyaudio
import wave
import numpy as np
import requests
import json
import io
from scipy.io import wavfile
from openai import OpenAI
from elevenlabs import ElevenLabs, play
# DeepInfra API token; replace the placeholder with your own key before running.
DEEPINFRA_API_KEY = "YOUR_DEEPINFRA_TOKEN"
# Model identifier interpolated into the DeepInfra inference URL used for transcription.
WHISPER_MODEL = "distil-whisper/distil-large-v3"
def record_audio(duration=5, sample_rate=16000):
    """Record mono 16-bit audio from the default input device.

    Args:
        duration: Length of the recording in seconds.
        sample_rate: Sampling rate in Hz.

    Returns:
        A 1-D ``numpy.int16`` array holding ``int(sample_rate * duration)``
        samples (empty when that count is zero or negative).
    """
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1

    # Count frames rather than whole chunks so the clip is not silently
    # truncated to a multiple of CHUNK (or to nothing for short durations).
    remaining = int(sample_rate * duration)
    frames = []

    p = pyaudio.PyAudio()
    try:
        print("Recording...")
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=sample_rate,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            while remaining > 0:
                count = min(CHUNK, remaining)
                frames.append(stream.read(count))
                remaining -= count
            print("Recording complete.")
        finally:
            # Release the input stream even if a read fails part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        # Always release PortAudio, including when opening the stream fails.
        p.terminate()

    # Convert the raw little chunks of bytes into one numpy array
    return np.frombuffer(b''.join(frames), dtype=np.int16)
def transcribe_audio(audio, sample_rate=16000, timeout=60):
    """Transcribe recorded speech with the Whisper model hosted on DeepInfra.

    Args:
        audio: NumPy array of audio samples; it is cast to int16 before
            being encoded as WAV.
        sample_rate: Sampling rate in Hz written into the WAV header. It must
            match the rate the audio was recorded at.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The transcribed text, or ``None`` when the request fails or the
        server answers with a non-200 status (the error is printed).
    """
    # Encode the samples as a WAV file held in memory
    buffer = io.BytesIO()
    wavfile.write(buffer, sample_rate, audio.astype(np.int16))
    buffer.seek(0)

    # Prepare the request
    url = f'https://api.deepinfra.com/v1/inference/{WHISPER_MODEL}'
    headers = {
        "Authorization": f"bearer {DEEPINFRA_API_KEY}"
    }
    files = {
        'audio': ('audio.wav', buffer, 'audio/wav')
    }

    # Send the request; without a timeout a stalled connection would block
    # the assistant forever.
    try:
        response = requests.post(url, headers=headers, files=files,
                                 timeout=timeout)
    except requests.RequestException as exc:
        # Report transport failures the same way as HTTP errors below.
        print(f"Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

    return response.json()['text']
# Try the transcription step on its own: record one clip with the default
# settings, send it to Whisper, and show the recognized text.
audio = record_audio()
transcription = transcribe_audio(audio)
# transcribe_audio returns None on a non-200 response, so this may print "None".
print(f"Transcription: {transcription}")
Now, let's use the OpenAI client to interact with an LLM:
# OpenAI client pointed at DeepInfra's /v1/openai endpoint and authenticated with the DeepInfra key.
openai_client = OpenAI(api_key=DEEPINFRA_API_KEY, base_url="https://api.deepinfra.com/v1/openai")
# Chat model that chat_with_llm requests.
MODEL_DI = "meta-llama/Meta-Llama-3.1-70B-Instruct"
def chat_with_llm(user_input):
    """Send a single user message to the LLM and return the reply text.

    Args:
        user_input: The text to send as the one and only user message.

    Returns:
        The message content of the first completion choice.
    """
    # Each call is stateless: the conversation holds just this one message.
    conversation = [{"role": "user", "content": user_input}]
    completion = openai_client.chat.completions.create(
        model=MODEL_DI,
        messages=conversation,
        max_tokens=1000,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
# Feed the transcription from the previous step to the LLM and show its reply.
llm_response = chat_with_llm(transcription)
print(f"LLM Response: {llm_response}")
# ElevenLabs client pointed at DeepInfra's base URL, reusing the DeepInfra key; text_to_speech uses it.
client = ElevenLabs(api_key=DEEPINFRA_API_KEY, base_url="https://api.deepinfra.com")
def text_to_speech(text):
    """Synthesize ``text`` with the TTS model and play the result.

    Args:
        text: The text to be spoken.
    """
    # Gather the synthesis settings in one place before calling the client.
    synthesis_options = {
        "text": text,
        "voice": "luna",
        "model": "deepinfra/tts",
    }
    play(client.generate(**synthesis_options))
# Speak the LLM reply obtained in the previous step.
text_to_speech(llm_response)
Now, let's combine all these steps into a single voice assistant function:
def voice_assistant():
    """Run the interactive record -> transcribe -> chat -> speak loop.

    Each turn records a clip, transcribes it, sends the text to the LLM and
    plays the reply. After every turn the user is asked whether to continue;
    any answer other than "y" ends the loop.
    """
    while True:
        # Record and transcribe audio
        audio = record_audio()
        transcription = transcribe_audio(audio)

        # transcribe_audio returns None on failure, and silence can yield
        # empty text; neither should be forwarded to the LLM or spoken.
        if transcription and transcription.strip():
            print(f"You said: {transcription}")

            # Chat with LLM
            llm_response = chat_with_llm(transcription)
            print(f"Assistant: {llm_response}")

            # Convert response to speech
            text_to_speech(llm_response)
        else:
            print("No speech was transcribed; skipping this turn.")

        # Ask if the user wants to continue (tolerate stray whitespace)
        if input("Continue? (y/n): ").strip().lower() != 'y':
            break
# Start the interactive loop; it runs until the user answers anything other than "y".
voice_assistant()
This voice assistant repeatedly records a short clip of user speech, transcribes it, processes it with an LLM, and responds with synthesized speech until the user chooses to stop.
Remember to replace YOUR_DEEPINFRA_TOKEN with your actual API key.
By leveraging the power of Whisper for speech recognition, an LLM for intelligent conversation, and a TTS model for natural-sounding text-to-speech, you can create a sophisticated voice assistant capable of understanding and responding to a wide range of user queries.