# Build a Q&A application with SageMaker JumpStart, LangChain and a FAISS index

This notebook explains the steps required to build a Question & Answer application using the Retrieval Augmented Generation (RAG) architecture.
RAG combines the power of pre-trained LLMs with information retrieval, enabling more accurate and context-aware responses.

(This notebook was tested on a SageMaker Studio ml.m5.2xlarge instance with the Data Science 3.0 kernel.)

## Pre-requisites

In [None]:
!pip install faiss-cpu
!pip install langchain --upgrade
!pip install pypdf

In [None]:
!pip install sentence_transformers

In [None]:
!pip install sagemaker --upgrade

## Restart Kernel

In [None]:
# Restart the kernel after the installs above.
# NOTE: running this cell shuts down the current kernel, so continue with the
# cells below once it has come back up.
import IPython
app = IPython.Application.instance()
# The True argument is presumably the "restart" flag (restart instead of just stopping) - confirm against ipykernel.
app.kernel.do_shutdown(True) 

## Set up dependencies

In [None]:
# Display the Python version. It must be at least 3.8 (the next cell asserts this);
# per the original author's note, this is what LangChain requires.
import sys
sys.version

In [None]:
# Stop here with an explicit message when the interpreter is older than 3.8.
assert sys.version_info >= (3, 8), (
    f"Python 3.8 or newer is required, found {sys.version.split()[0]}"
)

In [None]:
import langchain

In [None]:
langchain.__version__

## Deploy SageMaker Jumpstart model to an endpoint

In [None]:
import os
import time
import sagemaker
import boto3
import json
from sagemaker.session import Session
from sagemaker.model import Model
from sagemaker import image_uris, model_uris, script_uris,instance_types, hyperparameters
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base

# A single SageMaker session is shared by the whole notebook.
sagemaker_session = Session()
# `sess` is kept as an alias of the same session instead of constructing a
# second, identical Session (which would create another set of AWS clients).
sess = sagemaker_session
# IAM identity/role of the caller; used as the execution role of the model.
aws_role = sagemaker_session.get_caller_identity_arn()
# Read the region from the session so that the role, the image/model lookups
# and the endpoint all refer to the same region.
aws_region = sagemaker_session.boto_region_name
# Low-level runtime client for invoking endpoints directly.
sm_client = boto3.client("runtime.sagemaker")

In [None]:
# JumpStart model to deploy. Other model ids left by the author as alternatives:
#   "huggingface-textgeneration1-bloomz-7b1-fp16"
#   "huggingface-text2text-flan-ul2-bf16"
#   "huggingface-text2text-flan-t5-xxl"
model_id = "huggingface-text2text-flan-t5-xl"
# "*" is presumably a wildcard for the latest available model version - confirm against the JumpStart docs.
model_version = "*"
# The same string is used further down as both the SageMaker model name and the endpoint name.
endpoint_name = f'sm-jumpstart-langchain-{model_id}'

In [None]:
# Ask JumpStart for the default inference instance type of the selected model.
instance_type = instance_types.retrieve_default(
    scope="inference",
    model_id=model_id,
    model_version=model_version,
)
# To override the default, assign an instance type explicitly, e.g.:
#instance_type = 'ml.g5.24xlarge'
instance_type

In [None]:
# Environment passed to the inference container
# (presumably limits the model server to one worker per model - confirm).
env = {"MMS_DEFAULT_WORKERS_PER_MODEL": "1"}

# Inference container image for this model and instance type.
image_uri = image_uris.retrieve(
    region=aws_region,
    framework=None,
    image_scope="inference",
    model_id=model_id,
    model_version=model_version,
    instance_type=instance_type,
)

# Location of the model artifact for inference.
model_uri = model_uris.retrieve(
    model_id=model_id,
    model_version=model_version,
    model_scope="inference",
)

# SageMaker Model object; the endpoint name doubles as the model name.
model = Model(
    image_uri=image_uri,
    model_data=model_uri,
    role=aws_role,
    predictor_cls=Predictor,
    name=endpoint_name,
    env=env,
)

print(f'Image URI {image_uri}')
print(f'Model URI {model_uri}')

In [None]:
# Deploy the model to an endpoint with a single instance and keep the
# returned predictor (it is used again in the clean-up cell).
deploy_args = {
    "initial_instance_count": 1,
    "instance_type": instance_type,
    "predictor_cls": Predictor,
    "endpoint_name": endpoint_name,
}
predictor = model.deploy(**deploy_args)

## Perform document pre-processing
Load the documents and clean up their text before generating embeddings.

In [None]:
import os, json
from tqdm import tqdm
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter,CharacterTextSplitter,NLTKTextSplitter
import pathlib 

In [None]:
from langchain.vectorstores import FAISS
# Folder name for FAISS indices (presumably where the index is saved to / loaded from; the consuming cell is not visible here).
index_path = 'faiss_indices'

In [None]:
# Put your directory containing PDFs here.
# `index_name` names the document set; the PDFs are read from pdfs/<index_name>.
index_name = 'firetv'
directory = f'pdfs/{index_name}'

If you have previously generated the document embeddings and saved them locally, skip the following section and go to the Generate Embeddings section.

In [None]:
# Chunking configuration: chunks of up to 800 units with an overlap of 100,
# where length is measured with len() (i.e. in characters).
splitter_options = {
    "chunk_size": 800,
    "chunk_overlap": 100,
    # Optional custom separators, left disabled as in the original:
    # "separators": ["\n\n", "\n", ".", "!", "?", " ", ",", ""],
    "length_function": len,
    "keep_separator": False,
    "add_start_index": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)


In [None]:
# Collect only the PDF files in `directory`. os.listdir also returns other
# entries (for example Jupyter's hidden `.ipynb_checkpoints` folder), which
# the PDF loader in the next cell cannot parse.
# Sorting makes the load order - and therefore the chunk order - reproducible,
# because os.listdir returns entries in arbitrary order.
pdf_documents = sorted(
    os.path.join(directory, filename)
    for filename in os.listdir(directory)
    if filename.lower().endswith(".pdf")
)
pdf_documents

In [None]:
# Load every PDF and gather the documents returned by each loader into one flat list.
langchain_documents = []
for pdf_path in pdf_documents:
    langchain_documents += PyPDFLoader(pdf_path).load()


In [None]:
# Report how many documents were loaded, split them into chunks with the
# configured splitter, then report the resulting chunk count.
loaded_count = len(langchain_documents)
print("loaded document pages: ", loaded_count)
print("Splitting all documents")
split_docs = text_splitter.split_documents(langchain_documents)
split_count = len(split_docs)
print("Num split pages: ", split_count)

In [None]:
# Preview the text of the first chunk (indexing [0] fails if no chunks were produced).
split_docs[0].page_content

In [None]:
import regex as re
for d in split_docs:
 text = d.page_content
 text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
 text = re.sub(r"(? bytes:
 input_str = json.dumps({"text_inputs": prompt, **model_kwargs})
 return input_str.encode('utf-8')

 def transform_output(self, output: bytes) -> str:
 response_json = json.loads(output.read().decode("utf-8"))
 return response_json["generated_texts"][0]

# Handler that converts prompts / responses to and from the endpoint payload format.
content_handler = ContentHandler()

# Generation parameters sent with every request to the endpoint.
generation_kwargs = {
    "temperature": temperature,
    "max_length": max_length,
    "top_p": top_p,
    "top_k": top_k,
}

# LangChain LLM backed by the SageMaker endpoint deployed earlier.
llm = SagemakerEndpoint(
    endpoint_name=endpoint_name,
    region_name=aws_region,
    model_kwargs=generation_kwargs,
    content_handler=content_handler,
)


### Method 1 - Simple query with vector store wrapper

In [None]:
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
# Wrap the vector store `db_local` (defined in an earlier cell) so it can be queried directly.
wrapper_store = VectorStoreIndexWrapper(vectorstore=db_local)

# Ask the question through the wrapper, using the SageMaker-backed LLM, and print the answer.
response = wrapper_store.query(question=query, llm=llm)
print(response)

### Method 2 - Query with chain

In [None]:
from langchain.chains.question_answering import load_qa_chain

# "stuff" question-answering chain on top of the LLM.
chain = load_qa_chain(llm, chain_type="stuff")

# Fetch the 5 most similar chunks for the query and pass them to the chain.
documents = db_local.similarity_search(query=query, k=5)
answer = chain.run(input_documents=documents, question=query)
print(answer)

### Method 3 - Query with prompt template (provides prompt customization)

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Prompt text with two placeholders: {context} and {question}.
# The string is sent to the model as-is, so its exact whitespace is part of the prompt.
prompt_template = """Human: Use the following pieces of context to provide a concise answer to the question at the end. 

{context}

Question: {question}
Assistant:"""
# Template object declaring the two variables used in the text above.
PROMPT = PromptTemplate(
 template=prompt_template, input_variables=["context", "question"]
)

In [None]:
# Retriever over the vector store: similarity search returning 3 results.
retriever = db_local.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

# Retrieval QA chain that uses the custom prompt and also returns the source documents.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
)

# Run the chain on the query and print the answer text.
response = qa({'query':query})
print(response['result'])

In [None]:
# Show the source documents returned with the answer (requested via return_source_documents=True when the chain was built).
response['source_documents']

## Implement RAG architecture with Kendra Index

In [None]:
# Provide the Kendra index here; the empty placeholder must be replaced before running the cells below, which pass it as IndexId.
kendra_index = "" #Provide Kendra index here

In [None]:
from langchain.schema.document import Document

# Fail early with an actionable message instead of an AWS parameter-validation
# error when the placeholder index ID has not been filled in.
if not kendra_index:
    raise ValueError(
        "Set `kendra_index` to your Amazon Kendra index ID before running this cell."
    )

kendra = boto3.client('kendra')
# Retrieve passages relevant to the query from the Kendra index.
response = kendra.retrieve(IndexId=kendra_index, QueryText=query)

# Wrap each passage in a LangChain Document. The title and URI (when the
# result carries them) are kept as metadata so answers can be traced back.
docs = [
    Document(
        page_content=item['Content'],
        metadata={
            "title": item.get('DocumentTitle'),
            "source": item.get('DocumentURI'),
        },
    )
    for item in response.get('ResultItems', [])
]
docs

In [None]:
# Answer the query from the Kendra passages with a "stuff" QA chain.
# `load_qa_chain` is imported in the "Method 2" cell, so that cell must have been run first.
chain = load_qa_chain(llm, chain_type="stuff")
print(chain.run(input_documents=docs, question=query))

## Clean-Up (Optional)
Delete the model and the endpoint

In [None]:
# Cleanup: delete the model and the endpoint through the predictor returned by the deploy cell.
# NOTE(review): the model is deleted before the endpoint; presumably the order matters
# because the model lookup goes through the endpoint - confirm against the SageMaker SDK.
predictor.delete_model()
predictor.delete_endpoint()