This tutorial will guide you through building a Retrieval-Augmented Generation (RAG) application using the recently released transcripts from the Manhattan criminal trial of former President Donald J. Trump.
This tutorial is composed of three parts: indexing, retrieval, and generation.
Indexing
We'll be using the premade "Multi-Page PDF Processing" template: https://mixpeek.com/explore/multi-page-pdf-processing
from mixpeek import Mixpeek, FileTools, SourceS3
def handler(event, context):
    """Build one indexable record per page of a PDF.

    The document location is resolved from ``event['bucket']`` and
    ``event['key']`` via ``SourceS3.file_url``. The document is loaded once;
    each page's text is then extracted and embedded.

    Args:
        event: Mapping providing the ``bucket`` and ``key`` of the PDF.
        context: Not used by this handler.

    Returns:
        A list of dicts, one per page in page order, each with the keys
        ``page_number``, ``text``, ``embedding`` and ``file_url``.
    """
    client = Mixpeek("API_KEY")
    source_url = SourceS3.file_url(event['bucket'], event['key'])
    document = FileTools.load_document(source_url)
    page_count = FileTools.document_page_count(document)

    def build_record(number):
        # Extract first, then embed, so each page is fully processed before
        # the next one is touched.
        content = FileTools.extract_text(document, number)
        vector = client.embed.text(content, "jinaai/jina-embeddings-v2-base-en")
        return {
            "page_number": number,
            "text": content,
            "embedding": vector,
            "file_url": source_url,
        }

    # Page numbers handed to extract_text run from 1 through page_count.
    return [build_record(number) for number in range(1, page_count + 1)]
Then we connect our S3 bucket and destination collection (MongoDB in this case).
We just drag the PDF into our S3 bucket and the pipeline is triggered, processing the document and sending the entire output into our database.
Retrieval
First we take the query the user provides and generate an embedding:
# The user's question, embedded with the same model name the indexing handler
# passes for page text.
# NOTE(review): `mixpeek` is assumed to be an initialised Mixpeek client in
# this scope.
# NOTE(review): the indexing handler calls `mixpeek.embed.text(...)` whereas
# this calls `mixpeek.embed(...)` -- confirm which form the SDK exposes.
text_query = "who is Stormy Daniels?"
embedding = mixpeek.embed(query=text_query, model="jinaai/jina-embeddings-v2-base-en")
We'll take that plain-text query and its embedding, then perform a hybrid search in MongoDB using Reciprocal Rank Fusion:
# Hybrid search with Reciprocal Rank Fusion (RRF).
#
# Each branch (vector search, full-text search) ranks its own hits; a hit at
# 0-based position `rank` scores 1 / (rank + penalty + 1). A larger penalty
# therefore flattens that branch's contribution. The two scores are summed
# per document and the top 10 are returned.
#
# Field names follow what the indexing handler emits for every page:
# `embedding`, `text`, `page_number`, `file_url`.
#
# NOTE(review): assumes `db` is a connected database handle and that
# `embedding` / `text_query` hold the query vector and the raw question.
# NOTE(review): both branches read `embedded_documents` so hits can be fused
# on `_id`; this presumes the Atlas Search index "rrf-full-text-search" is
# defined on that collection -- confirm against the deployment.
vector_penalty = 1
full_text_penalty = 10

results = db.embedded_documents.aggregate([
    # --- Branch 1: semantic (vector) search --------------------------------
    {
        "$vectorSearch": {
            "index": "rrf-vector-search",
            "path": "embedding",
            "queryVector": embedding,
            "numCandidates": 100,
            "limit": 20,
        }
    },
    # Gather the ordered hits into one array, then unwind with the array
    # index so every hit carries its position as `rank`.
    {
        "$group": {
            "_id": None,
            "docs": {"$push": "$$ROOT"},
        }
    },
    {
        "$unwind": {
            "path": "$docs",
            "includeArrayIndex": "rank",
        }
    },
    {
        "$addFields": {
            "vs_score": {
                "$divide": [1.0, {"$add": ["$rank", vector_penalty, 1]}]
            }
        }
    },
    # Keep only what the fusion and the later generation step need; the
    # embedding itself is dropped here.
    {
        "$project": {
            "vs_score": 1,
            "_id": "$docs._id",
            "text": "$docs.text",
            "page_number": "$docs.page_number",
            "file_url": "$docs.file_url",
        }
    },
    # --- Branch 2: full-text search, appended to the vector hits -----------
    {
        "$unionWith": {
            "coll": "embedded_documents",
            "pipeline": [
                {
                    "$search": {
                        "index": "rrf-full-text-search",
                        # NOTE(review): `phrase` looks for the query terms
                        # as a phrase; a natural-language question may match
                        # few pages -- consider the `text` operator.
                        "phrase": {
                            "query": text_query,
                            "path": "text",
                        },
                    }
                },
                {"$limit": 20},
                {
                    "$group": {
                        "_id": None,
                        "docs": {"$push": "$$ROOT"},
                    }
                },
                {
                    "$unwind": {
                        "path": "$docs",
                        "includeArrayIndex": "rank",
                    }
                },
                {
                    "$addFields": {
                        "fts_score": {
                            "$divide": [
                                1.0,
                                {"$add": ["$rank", full_text_penalty, 1]},
                            ]
                        }
                    }
                },
                {
                    "$project": {
                        "fts_score": 1,
                        "_id": "$docs._id",
                        "text": "$docs.text",
                        "page_number": "$docs.page_number",
                        "file_url": "$docs.file_url",
                    }
                },
            ],
        }
    },
    # --- Fusion -------------------------------------------------------------
    # A page found by both branches appears twice; merge the two rows on the
    # document id, keeping each branch's score.
    {
        "$group": {
            "_id": "$_id",
            "text": {"$first": "$text"},
            "page_number": {"$first": "$page_number"},
            "file_url": {"$first": "$file_url"},
            "vs_score": {"$max": "$vs_score"},
            "fts_score": {"$max": "$fts_score"},
        }
    },
    # A page found by only one branch has no score from the other: use 0.
    {
        "$project": {
            "_id": 1,
            "text": 1,
            "page_number": 1,
            "file_url": 1,
            "vs_score": {"$ifNull": ["$vs_score", 0]},
            "fts_score": {"$ifNull": ["$fts_score", 0]},
        }
    },
    {
        "$project": {
            "score": {"$add": ["$fts_score", "$vs_score"]},
            "_id": 1,
            "text": 1,
            "page_number": 1,
            "file_url": 1,
            "vs_score": 1,
            "fts_score": 1,
        }
    },
    {"$sort": {"score": -1}},
    {"$limit": 10},
])
Generation
Use a model like gpt-4-turbo to generate a response from the retrieved documents and the query,
and structure the output into a summary and a list of sources.
class Response(BaseModel):
    """Shape of the structured answer requested from the model.

    NOTE(review): ``BaseModel`` is presumably pydantic's -- confirm it is
    imported where this snippet runs.
    """

    # Short heading for the answer.
    title: str
    # The generated answer text.
    answer: str


# Request output that conforms to `Response`, the model defined above.
# NOTE(review): `file_output` is not defined in the visible snippets;
# presumably it holds the retrieved documents -- confirm before running.
response = mixpeek.generate(
    model={"provider": "GPT", "model": "gpt-4-turbo"},
    response_format=Response,
    context=f"Format this document and adhere to the provided JSON format: {file_output}",
)
Then we get beautifully crafted responses for our UI:
{
"title": "Who is Stormy Daniels?",
"answer": "Stormy Daniels, whose legal name is Stephanie Clifford, is a prominent figure primarily known for her involvement in a legal controversy with former U.S. President Donald Trump. She is an adult film actress and director, who came into the spotlight due to her allegations of an extramarital sexual encounter with Trump, which he has denied. Her case gained significant media attention when it was revealed that Trump's lawyer, Michael Cohen, had made a payment to Daniels to prevent her from discussing the alleged encounter publicly ahead of the 2016 presidential election. This led to various legal battles and discussions about potential campaign finance violations."
}