LlamaIndex
LlamaIndex is a popular data framework for LLM applications, and it now works with DeepInfra.
First, install the necessary package:
pip install llama-index-llms-deepinfra
Set up the DeepInfraLLM class with your API key and desired parameters:
from llama_index.llms.deepinfra import DeepInfraLLM
import asyncio
llm = DeepInfraLLM(
    model="mistralai/Mixtral-8x22B-Instruct-v0.1",  # Default model name
    api_key="$DEEPINFRA_TOKEN",  # Replace with your DeepInfra API key
    temperature=0.5,
    max_tokens=50,
    additional_kwargs={"top_p": 0.9},
)
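If you prefer not to paste the key into code, one option (a minimal sketch, not part of the original setup) is to read it from the DEEPINFRA_TOKEN environment variable:
import os

llm = DeepInfraLLM(
    model="mistralai/Mixtral-8x22B-Instruct-v0.1",
    api_key=os.getenv("DEEPINFRA_TOKEN"),  # assumes DEEPINFRA_TOKEN is set in your shell
    temperature=0.5,
    max_tokens=50,
)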
Generate a text completion synchronously using the complete method:
response = llm.complete("Hello World!")
print(response.text)
Generate a streaming text completion synchronously using the stream_complete method:
content = ""
for completion in llm.stream_complete("Once upon a time"):
content += completion.delta
print(completion.delta, end="")
Generate a chat response synchronously using the chat method:
from llama_index.core.base.llms.types import ChatMessage
messages = [
    ChatMessage(role="user", content="Tell me a joke."),
]
chat_response = llm.chat(messages)
print(chat_response.message.content)
Generate a streaming chat response synchronously using the stream_chat method:
messages = [
    ChatMessage(role="system", content="You are a helpful assistant."),
    ChatMessage(role="user", content="Tell me a story."),
]
content = ""
for chat_response in llm.stream_chat(messages):
    content += chat_response.delta
    print(chat_response.delta, end="")
Generate a text completion asynchronously using the acomplete method:
async def async_complete():
    response = await llm.acomplete("Hello Async World!")
    print(response.text)

asyncio.run(async_complete())
Generate a streaming text completion asynchronously using the astream_complete method:
async def async_stream_complete():
    content = ""
    response = await llm.astream_complete("Once upon an async time")
    async for completion in response:
        content += completion.delta
        print(completion.delta, end="")

asyncio.run(async_stream_complete())
Generate a chat response asynchronously using the achat method:
async def async_chat():
    messages = [
        ChatMessage(role="user", content="Tell me an async joke."),
    ]
    chat_response = await llm.achat(messages)
    print(chat_response.message.content)

asyncio.run(async_chat())
Generate a streaming chat response asynchronously using the astream_chat method:
async def async_stream_chat():
    messages = [
        ChatMessage(role="system", content="You are a helpful assistant."),
        ChatMessage(role="user", content="Tell me an async story."),
    ]
    content = ""
    response = await llm.astream_chat(messages)
    async for chat_response in response:
        content += chat_response.delta
        print(chat_response.delta, end="")

asyncio.run(async_stream_chat())
LlamaIndex can also work with DeepInfra embedding models to get embeddings for your text data.
pip install llama-index llama-index-embeddings-deepinfra
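Set up the DeepInfraEmbeddingModel with your API token and any optional parameters: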
from dotenv import load_dotenv, find_dotenv
from llama_index.embeddings.deepinfra import DeepInfraEmbeddingModel
_ = load_dotenv(find_dotenv())
model = DeepInfraEmbeddingModel(
    model_id="BAAI/bge-large-en-v1.5",  # Use custom model ID
    api_token="YOUR_API_TOKEN",  # Optionally provide token here
    normalize=True,  # Optional normalization
    text_prefix="text: ",  # Optional text prefix
    query_prefix="query: ",  # Optional query prefix
)
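Generate an embedding for a single piece of text using the get_text_embedding method: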
response = model.get_text_embedding("hello world")
print(response)
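Generate embeddings for several texts in one call using the get_text_embedding_batch method: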
texts = ["hello world", "goodbye world"]
response_batch = model.get_text_embedding_batch(texts)
print(response_batch)
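Generate a query embedding using the get_query_embedding method: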
query_response = model.get_query_embedding("hello world")
print(query_response)
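Generate an embedding asynchronously using the aget_text_embedding method: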
async def main():
    text = "hello world"
    async_response = await model.aget_text_embedding(text)
    print(async_response)

if __name__ == "__main__":
    import asyncio
    asyncio.run(main())
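To tie the pieces together, both the DeepInfra LLM and embedding model can be plugged into the wider LlamaIndex toolkit. The following is a minimal sketch, not part of the original examples: it assumes llama-index-core is installed and uses its standard Settings and VectorStoreIndex APIs, with placeholder documents and the same models as above.
from llama_index.core import Document, Settings, VectorStoreIndex
from llama_index.llms.deepinfra import DeepInfraLLM
from llama_index.embeddings.deepinfra import DeepInfraEmbeddingModel

# Use DeepInfra for both generation and embeddings.
Settings.llm = DeepInfraLLM(
    model="mistralai/Mixtral-8x22B-Instruct-v0.1",
    api_key="$DEEPINFRA_TOKEN",  # Replace with your DeepInfra API key
)
Settings.embed_model = DeepInfraEmbeddingModel(
    model_id="BAAI/bge-large-en-v1.5",
    api_token="$DEEPINFRA_TOKEN",  # Replace with your DeepInfra API key
)

# Build an in-memory vector index over placeholder documents and query it.
documents = [
    Document(text="DeepInfra hosts open-source models behind a simple API."),
    Document(text="LlamaIndex connects your data to LLMs."),
]
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
print(query_engine.query("What does DeepInfra do?"))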