Enterprise Grade RAG in #Microsoft #Fabric using #CosmosDB and #DiskANN
In this video, we’re hardening our RAG architecture to meet the demands of the enterprise! Building on our previous video (https://youtu.be/jwVOQCUUH1Y), we are making our RAG for Microsoft Fabric enterprise grade.
This is the code used in this video:
%pip install langchain
%pip install langchain-core
%pip install langchain-experimental
%pip install langchain_openai
%pip install langchain-chroma
%pip install langchainhub
%pip install PyPDF2
%pip install --upgrade --quiet azure-cosmos langchain-openai langchain-community
import os, openai#, langchain, uuid
from synapse.ml.core.platform import find_secret
# --- Secrets: read from a key vault so no key is hard-coded in the notebook ---
openai_key = find_secret(secret_name="YOUROPENAIKEY", keyvault="YOUR_KEYVAULT_NAME")
cosmosdb_key = find_secret(secret_name="YOURCOSMOSKEY", keyvault="YOUR_KEYVAULT_NAME")

# --- Azure OpenAI resource and deployment names ---
openai_service_name = "YOUR_SERVICE_NAME"
# Derived from the service name so the two values cannot drift apart when the
# placeholder is replaced with a real resource name.
openai_endpoint = f"https://{openai_service_name}.openai.azure.com/"
openai_deployment_for_embeddings = "text-embedding-ada-002"
openai_deployment_for_query = "gpt-35-turbo"
openai_deployment_for_completions = "davinci-002"
openai_api_type = "azure"
openai_api_version = "2023-12-01-preview"

# --- Environment variables picked up by the OpenAI / LangChain clients ---
os.environ["OPENAI_API_TYPE"] = openai_api_type
os.environ["OPENAI_API_VERSION"] = openai_api_version
os.environ["OPENAI_API_KEY"] = openai_key
os.environ["AZURE_OPENAI_ENDPOINT"] = openai_endpoint

# Remove any inherited OPENAI_API_BASE. pop() with a default is used instead of
# `del` so the cell does not raise KeyError when the variable was never set.
# NOTE(review): presumably the runtime pre-sets this variable and it conflicts
# with AZURE_OPENAI_ENDPOINT -- confirm against the Fabric environment.
os.environ.pop("OPENAI_API_BASE", None)

# Lakehouse folder that holds the source files for this notebook.
base_path = "/lakehouse/default/Files/YOURFOLDER/"
import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
#from langchain_openai import AzureOpenAIEmbeddings
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.llms import AzureOpenAI, OpenAI
from langchain_openai import AzureOpenAIEmbeddings
from PyPDF2 import PdfReader
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
# Folder scanned for PDF files; reuses the lakehouse base path configured above.
folder_path = base_path
def load_pdfs_from_folder(folder_path):
    """Read every PDF in ``folder_path`` into one ``Document`` per file.

    The text of all pages of a file is concatenated (no separator) into a
    single ``page_content`` string, and the file name is stored under the
    ``"document_name"`` metadata key.

    Args:
        folder_path: Directory to scan. Only its direct entries are
            considered; sub-folders are not walked.

    Returns:
        A list of ``Document`` objects, ordered by file name. The list is
        empty when the folder contains no PDF files.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # resulting document order reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        # Compare case-insensitively so files such as "REPORT.PDF" are not skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        reader = PdfReader(os.path.join(folder_path, filename))
        # `or ""` guards against pages for which extract_text() yields no
        # string (e.g. pages without a text layer, depending on the PyPDF2
        # version); join() avoids repeated string re-allocation.
        text = "".join(page.extract_text() or "" for page in reader.pages)
        documents.append(
            Document(page_content=text, metadata={"document_name": filename})
        )
    return documents
# Load one Document per PDF found in the configured folder.
documents = load_pdfs_from_folder(folder_path)

# List the name of every loaded document, each followed by a separator line.
# (Printing doc.page_content as well is possible but very verbose.)
for doc in documents:
    print(f"Document Name: {doc.metadata['document_name']}")
    print("\n---\n")
# The vector index and the embedding policy must refer to the same JSON path,
# so it is named once and reused in both policies.
_EMBEDDING_PATH = "/embedding"

# Container indexing policy: index every path except the system "_etag"
# property, and add a DiskANN vector index on the embedding path.
indexing_policy = {
    "indexingMode": "consistent",
    "includedPaths": [{"path": "/*"}],
    "excludedPaths": [{"path": '/"_etag"/?'}],
    "vectorIndexes": [{"path": _EMBEDDING_PATH, "type": "diskANN"}],
}

# Describes the vectors stored at the embedding path: float32 values,
# 1536 dimensions, compared with cosine distance.
_embedding_spec = {
    "path": _EMBEDDING_PATH,
    "dataType": "float32",
    "distanceFunction": "cosine",
    "dimensions": 1536,
}
vector_embedding_policy = {"vectorEmbeddings": [_embedding_spec]}
from azure.cosmos import CosmosClient, PartitionKey
from langchain_community.vectorstores.azure_cosmos_db_no_sql import (
AzureCosmosDBNoSqlVectorSearch,
)
from langchain_openai import AzureOpenAIEmbeddings
# --- Azure Cosmos DB connection settings ---
HOST = "https://YOURCOSMOSDB.documents.azure.com:443/"
KEY = cosmosdb_key  # account key read from the key vault earlier in the notebook
database_name = "YOURCOSMOSDBNAME"
container_name = "YOURCONTAINER"

# Client used by the vector store to reach the account.
cosmos_client = CosmosClient(HOST, KEY)

# Properties handed to the vector store for the database and the container;
# the container is partitioned on the item id.
cosmos_database_properties = {"id": database_name}
partition_key = PartitionKey(path="/id")
cosmos_container_properties = {"partition_key": partition_key}
# Use the embedding deployment configured at the top of the notebook. Without
# this argument the setting `openai_deployment_for_embeddings` is never read,
# so changing it would silently have no effect.
openai_embeddings = AzureOpenAIEmbeddings(
    azure_deployment=openai_deployment_for_embeddings,
)

# Split each document into chunks of at most 4000 characters, with 200
# characters of overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200)
splits = text_splitter.split_documents(documents)

# Embed the chunks and insert them, together with their embeddings, into the
# Cosmos DB container described by the policies and properties defined above.
vector_search = AzureCosmosDBNoSqlVectorSearch.from_documents(
    documents=splits,
    embedding=openai_embeddings,
    cosmos_client=cosmos_client,
    database_name=database_name,
    container_name=container_name,
    vector_embedding_policy=vector_embedding_policy,
    indexing_policy=indexing_policy,
    cosmos_container_properties=cosmos_container_properties,
    cosmos_database_properties=cosmos_database_properties,
)
from langchain.schema import HumanMessage
import openai
# Sanity-check retrieval before building the chain. `answers` was never
# assigned in this notebook, so the lookup is performed here.
# NOTE(review): the query text is an assumption (it mirrors the question asked
# of the RAG chain at the end of the notebook) -- adjust as needed.
answers = vector_search.similarity_search("What is Prohabits?")
if answers:
    # Show the text of the best-matching chunk.
    display(answers[0].page_content)
else:
    print("The vector search returned no matching chunks.")
from langchain_openai import AzureChatOpenAI
from langchain.schema import HumanMessage
import openai
# Chat model backed by the Azure OpenAI deployment chosen for queries.
llm = AzureChatOpenAI(azure_deployment=openai_deployment_for_query)

# Expose the Cosmos DB vector store through the retriever interface and pull
# the "rlm/rag-prompt" prompt from the LangChain hub.
retriever = vector_search.as_retriever()
prompt = hub.pull("rlm/rag-prompt")

# Ask the model directly, with no retrieved context attached to the message.
message = HumanMessage(content="Tell me what you know about Prohabits.")
result = llm.invoke([message])
def format_docs(docs):
    """Merge the text of several documents into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string
            attribute.

    Returns:
        The ``page_content`` values joined with a blank line between
        consecutive documents; an empty string when ``docs`` is empty.
    """
    return "\n\n".join(doc.page_content for doc in docs)
# Retrieval branch: look up matching chunks, then flatten them to plain text.
context_branch = retriever | format_docs

# The incoming question is passed through unchanged next to the retrieved
# context; both feed the prompt, whose output goes to the chat model and is
# finally reduced to a plain string.
rag_chain = (
    {"context": context_branch, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What is Prohabits?")