Set API keys
# NOTE(review): these statements use `glob`, `os` and `qdrant_client`, which are
# imported further down in this listing — the imports must run first.
# Collect every PDF located directly inside the local "data" directory.
pdf_files = glob(os.path.join("data", "*.pdf"))
# Module-level Qdrant client used by recreate_collection() and
# store_embeddings_in_qdrant_v2(); the argument looks like a placeholder —
# presumably to be replaced with the real host and port.
client = qdrant_client.QdrantClient("qdrant_host:qdrant_port")
# Placeholder credentials — presumably to be replaced with real values before running.
openai_api_key = "openai_api_key"
infomaniak_api_key = "infomaniak_api_key"
infomaniak_product_id = "infomaniak_product_id"

March 16, 2025
import hashlib
import time
import uuid
import qdrant_client
from qdrant_client.http.models import Distance, VectorParams
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import pdfplumber
from openai import OpenAI
import requests
import json
from glob import glob
import os
import tiktoken
from transformers import AutoTokenizer
# Extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``pdfplumber.open``.

    Returns:
        One string holding the text of all pages, joined without a separator.
        A page that yields no text contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can yield None for a page without extractable text
        # (seen with some pdfplumber versions); `or ""` keeps the join from
        # raising a TypeError on such pages.
        return "".join(page.extract_text() or "" for page in pdf.pages)
# Chunk text into manageable sizes
def chunk_text(text, chunk_size=250):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    Args:
        text: String to split; words are separated on any whitespace.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of strings, each made of up to ``chunk_size`` words joined by a
        single space. An input without words gives an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces
# Generate embeddings using OpenAI API
def generate_embeddings(chunks, model_name, api_key):
    """Embed each chunk with the OpenAI embeddings endpoint.

    Args:
        chunks: Iterable of texts to embed, one request per item.
        model_name: Embedding model passed as ``model`` to the API.
        api_key: OpenAI API key used to build the client.

    Returns:
        A list with one entry per chunk, in input order: the embedding on
        success, or ``None`` when the request raised (the error is printed).
    """
    openai_client = OpenAI(api_key=api_key)

    def embed_one(piece):
        # Best effort: a failing chunk is reported and recorded as None so the
        # result stays aligned with the input.
        try:
            result = openai_client.embeddings.create(
                input=piece,
                model=model_name
            )
            return result.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding: {e}")
            return None

    return [embed_one(piece) for piece in chunks]
# Generate embeddings using Infomaniak API
def generate_embeddings_v2(chunks, product_id, model_name, api_token, chunk_count=0, rate=None, timeout=60):
    """Embed each chunk through the Infomaniak OpenAI-compatible endpoint.

    Args:
        chunks: Iterable of texts to embed, one HTTP request per item.
        product_id: Infomaniak product identifier inserted in the URL.
        model_name: Embedding model sent in the request body.
        api_token: Bearer token for the ``Authorization`` header.
        chunk_count: Number of requests already counted towards the current
            rate window (lets a caller carry the counter across calls).
        rate: Maximum requests per window; ``None`` disables throttling.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A tuple ``(chunk_count, embeddings)``: the updated request counter and
        a list aligned with ``chunks`` holding each embedding, or ``None``
        where the request failed (the error is printed).
    """
    url = f"https://api.infomaniak.com/1/ai/{product_id}/openai/v1/embeddings"
    headers = {"Authorization": f"Bearer {api_token}", "Content-Type": "application/json"}
    embeddings = []
    for chunk in chunks:
        # ">=" rather than "==": a counter handed in already above the limit
        # must still trigger the pause.
        if rate is not None and chunk_count >= rate:
            print("waiting one minute")
            time.sleep(60)  # Wait for one minute
            chunk_count = 0  # Reset the counter after waiting
        try:
            data = json.dumps({"input": [chunk], "model": model_name})
            # Without a timeout a stalled connection would block forever; a
            # timeout error is caught below like any other failure.
            response = requests.post(url, headers=headers, data=data, timeout=timeout)
            if response.status_code == 200:
                res_json = response.json()
                embeddings.append(res_json["data"][0]["embedding"])
            else:
                print(f"Error: {response.status_code} - {response.text}")
                embeddings.append(None)
        except Exception as e:
            print(f"Error generating embedding: {e}")
            embeddings.append(None)
        chunk_count += 1
    return chunk_count, embeddings
# Store embeddings in Qdrant
def recreate_collection(collection_prefix, model_name, vector_size):
    """Recreate the Qdrant collection ``<collection_prefix>-<model_name>``.

    Args:
        collection_prefix: First part of the collection name.
        model_name: Second part of the collection name.
        vector_size: Vector dimension configured for the collection.

    Uses the module-level ``client`` and cosine distance.
    """
    target = "{}-{}".format(collection_prefix, model_name)
    cosine_vectors = VectorParams(size=vector_size, distance=Distance.COSINE)
    client.recreate_collection(collection_name=target, vectors_config=cosine_vectors)
def store_embeddings_in_qdrant_v2(collection_prefix, model_name, embeddings, chunks):
    """Upsert chunk embeddings into ``<collection_prefix>-<model_name>``.

    Args:
        collection_prefix: First part of the collection name.
        model_name: Second part of the collection name.
        embeddings: One vector (or ``None``) per chunk, aligned with ``chunks``.
        chunks: The texts the embeddings were computed from.

    Entries whose embedding is ``None`` are skipped. Uses the module-level
    ``client`` and prints a confirmation once the upsert call returns.
    """
    target = "{}-{}".format(collection_prefix, model_name)
    records = []
    for position, embedding in enumerate(embeddings):
        if embedding is None:
            continue
        chunk = chunks[position]
        # The point id is a UUID built from the first 32 hex digits of the
        # chunk's SHA-256, so identical text always maps to the same id.
        digest = hashlib.sha256(chunk.encode('utf-8')).hexdigest()
        records.append({
            "id": str(uuid.UUID(digest[:32])),
            "vector": embedding,
            "payload": {"text": chunk},
        })
    client.upsert(collection_name=target, points=records)
    print(f"Embeddings stored in collection: {target}")
def _scroll_all_points(client, collection_name, limit):
    """Fetch every point (with vectors) of a collection, page by page."""
    collected = []
    offset = None
    while True:
        points, next_offset = client.scroll(
            collection_name=collection_name,
            with_vectors=True,
            limit=limit,
            offset=offset
        )
        collected.extend(points)
        # scroll() hands back the offset of the next page; None means the
        # collection is exhausted. The offset is not a running count, so it
        # must be taken from the response rather than computed.
        if next_offset is None or not points:
            return collected
        offset = next_offset


def visualize_embeddings(client, collection_name, limit=544):
    """Plot a 2-D PCA projection of all vectors stored in a collection.

    Args:
        client: Object exposing ``scroll(collection_name=..., with_vectors=...,
            limit=..., offset=...)`` that returns ``(points, next_offset)``.
        collection_name: Name of the collection to read.
        limit: Number of points requested per scroll page.

    Returns:
        None. Prints the collection name, then either shows a scatter plot or
        prints a message when there is nothing to plot.
    """
    print(collection_name)
    all_points = _scroll_all_points(client, collection_name, limit)
    if not all_points:
        print(f"No points found in collection: {collection_name}")
        return
    # Extract vectors from the fetched points
    vectors = np.array([point.vector for point in all_points if point.vector is not None])
    if vectors.size == 0:
        print("No valid vectors to visualize.")
        return
    # Reduce dimensions for visualization
    reduced_vectors = PCA(n_components=2).fit_transform(vectors)
    plt.scatter(reduced_vectors[:, 0], reduced_vectors[:, 1], alpha=0.5)
    plt.title(f"Visualization of Embeddings ({collection_name})")
    plt.show()
def recreate_collections():
    """Recreate the four "test" collections, one per embedding model.

    Each entry pairs a model name with the vector size configured for its
    collection; the collections are recreated in the order listed.
    """
    model_dimensions = (
        ("text-embedding-ada-002", 1536),
        ("text-embedding-3-large", 3072),
        ("mini_lm_l12_v2", 384),
        ("bge_multilingual_gemma2", 3584),
    )
    for model_name, vector_size in model_dimensions:
        recreate_collection("test", model_name, vector_size)
def extract_text_in_chunks(pdf_path):
    """Read a PDF and return its text split into word chunks.

    Args:
        pdf_path: Location of the PDF, forwarded to ``extract_text_from_pdf``.

    Returns:
        The list produced by ``chunk_text`` (default chunk size) for the
        extracted text.
    """
    return chunk_text(extract_text_from_pdf(pdf_path))
def get_nb_tokens(chunks, model_name):
    """Count the tiktoken tokens contained in all chunks.

    Args:
        chunks: Iterable of texts to tokenize.
        model_name: Either a tiktoken encoding name or a model name.

    Returns:
        The string ``"Number of tokens: <total>"``.

    Raises:
        KeyError: If ``model_name`` is neither a known encoding nor a model
            tiktoken can map to an encoding.
    """
    try:
        tokenizer = tiktoken.get_encoding(model_name)
    except ValueError:
        # get_encoding() only knows encoding names; a model name has to be
        # resolved through encoding_for_model() instead.
        tokenizer = tiktoken.encoding_for_model(model_name)
    nb_token = sum(len(tokenizer.encode(chunk)) for chunk in chunks)
    return f"Number of tokens: {nb_token}"
def get_nb_tokens_v2(chunks, model_name):
    """Count tokens in all chunks with a Hugging Face tokenizer.

    Args:
        chunks: Iterable of texts to tokenize.
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The string ``"Number of tokens: <total>"``.
    """
    hf_tokenizer = AutoTokenizer.from_pretrained(model_name)
    total = sum(len(hf_tokenizer.tokenize(piece)) for piece in chunks)
    return f"Number of tokens: {total}"
def retrieve_text_documents(qdrant_client,points,collection_name):
    """Fetch the stored payload for each search hit.

    Args:
        qdrant_client: Client exposing ``retrieve(collection_name=..., ids=...)``.
        points: Search hits; each must provide ``id`` and ``score``.
        collection_name: Collection the hits belong to.

    Returns:
        A list of dicts with keys ``'id'``, ``'score'`` and ``'content'`` (the
        payload of the first record returned for that id), in hit order.
    """
    def describe(hit):
        # One lookup per hit, by id.
        fetched = qdrant_client.retrieve(
            collection_name=collection_name,
            ids=[hit.id]
        )
        return {
            'id': hit.id,
            'score': hit.score,
            'content': fetched[0].payload
        }

    return [describe(hit) for hit in points]
def retrieve_qa_documents(qdrant_client, collection_name: str, prompt,model_name, api_key):
    """Embed a prompt with OpenAI and return the three closest documents.

    Args:
        qdrant_client: Client used for the vector search and payload lookup.
        collection_name: Collection to search.
        prompt: Text to embed and use as the query.
        model_name: OpenAI embedding model.
        api_key: OpenAI API key.

    Returns:
        The list built by ``retrieve_text_documents`` for the top 3 hits.
    """
    embedding_response = OpenAI(api_key=api_key).embeddings.create(
        input=prompt,
        model=model_name
    )
    query_vector = embedding_response.data[0].embedding
    # Vector search limited to the three best matches.
    hits = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=3
    )
    return retrieve_text_documents(qdrant_client, hits, collection_name)
def retrieve_qa_documents_v2(qdrant_client, collection_name: str, prompt,model_name, api_token,product_id, timeout=60):
    """Embed a prompt via Infomaniak and return the three closest documents.

    Args:
        qdrant_client: Client used for the vector search and payload lookup.
        collection_name: Collection to search.
        prompt: Text to embed and use as the query.
        model_name: Embedding model sent in the request body.
        api_token: Bearer token for the ``Authorization`` header.
        product_id: Infomaniak product identifier inserted in the URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list built by ``retrieve_text_documents`` for the top 3 hits, or
        ``None`` when the embedding request does not return HTTP 200 (the
        status and response text are printed).
    """
    url = f"https://api.infomaniak.com/1/ai/{product_id}/openai/v1/embeddings"
    headers = {"Authorization": f"Bearer {api_token}", "Content-Type": "application/json"}
    # Serialize with json.dumps instead of string concatenation: a prompt
    # containing quotes, backslashes or newlines would otherwise yield an
    # invalid JSON body.
    data = json.dumps({"input": [prompt], "model": model_name})
    response = requests.post(url, headers=headers, data=data, timeout=timeout)
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return None
    query_vector = response.json()["data"][0]["embedding"]
    # Vector search limited to the three best matches; the embedding is passed
    # directly as the query vector.
    search_results = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=3
    )
    return retrieve_text_documents(qdrant_client, search_results, collection_name)
def call_llm(prompt, documents, model, openai_api_key):
    """Ask an OpenAI chat model to answer a prompt given retrieved documents.

    Args:
        prompt: The user's question.
        documents: Iterable of dicts whose ``['content']['text']`` holds the
            retrieved text.
        model: Chat model name.
        openai_api_key: OpenAI API key.

    Returns:
        The content of the first choice's message.
    """
    context_lines = []
    for doc in documents:
        context_lines.append(doc['content']['text'])
    document_texts = "\n".join(context_lines)
    full_prompt = f"CONTENT:\n{document_texts}\n\nUSER PROMPT:\n{prompt}"
    # Single user message carrying both the retrieved content and the question.
    messages = [{"role": "user", "content": full_prompt}]
    completion = OpenAI(api_key=openai_api_key).chat.completions.create(
        messages=messages,
        model=model,
    )
    return completion.choices[0].message.content
def call_llm_v2(prompt, documents, model, product_id, api_key):
document_texts = "\n".join([doc['content']['text'] for doc in documents])
constructed_prompt = f"CONTENT:\n{document_texts}\n\nUSER PROMPT:\n{prompt}\n\nbased on only the content given by the user, create an answer for the prompt"
URL = f"https://api.infomaniak.com/1/ai/{product_id}/openai/chat/completions"
headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
data = {
"messages": [
{
"content": constructed_prompt,
"role": "user"
}
],
"model": model
}
# Convert dictionary to JSON string
json_data = json.dumps(data)
req = requests.request("POST", url = URL , data = json_data, headers = headers)
res = req.json()
return res["choices"][0]["message"]["content"]Since the highest average score in the previous test was for the question about Shakespeare and the Titanic, we will use this question and the documents retrieved with bge_multilingual_gemma2 to test some LLM models to generate answers. This question is also useful to test both creativity and the incorporation of factual data from the QA retrieval system at the same time.
# Question submitted to every model under comparison.
prompt="If Shakespeare had written about the Titanic tragedy, how might he have described the events and their emotional impact?"
# Retrieve the supporting documents from the "test-bge_multilingual_gemma2"
# collection; the prompt is embedded through the Infomaniak API.
# NOTE: retrieve_qa_documents_v2 returns None when the embedding request fails.
documents=retrieve_qa_documents_v2(client,"test-bge_multilingual_gemma2",prompt,"bge_multilingual_gemma2",infomaniak_api_key,infomaniak_product_id)
# Answers generated through the OpenAI client.
print(call_llm(prompt,documents,"gpt-4o", openai_api_key))
print(call_llm(prompt,documents,"gpt-4o-mini", openai_api_key))
# Answers generated through the Infomaniak chat-completions endpoint.
print(call_llm_v2(prompt,documents,"mixtral8x22b", infomaniak_product_id, infomaniak_api_key))
print(call_llm_v2(prompt,documents,"mixtral", infomaniak_product_id, infomaniak_api_key))
print(call_llm_v2(prompt,documents,"llama3", infomaniak_product_id, infomaniak_api_key))

Amidst the vast and uncharted seas, a vessel grand and fated, named Titanic, did set forth upon its maiden journey—a floating palace draped in illusions of invincibility. Yet, beneath the cloak of iron and opulence lay a tragic flaw, akin to the hubris of Ancient Greece, where overconfidence beguiles reason.
Lo, in the midnight hour, disaster struck with an icy specter lurking, a colossal iceberg—Nature’s vengeful monolith—did pierce the ship’s formidable armor. The heavens themselves wept as the ship of dreams descended into the abyss, marking the timely end of human ambition’s most glorious folly.
Oh, the despair! As lifeboats, scant though they were, bore witness to the frailty of man’s preparation. They departed their creator at nary half their fill, leaving souls adrift in a sea of agony, tokens of privilege and oversight. The survivors’ souls cried out to the indifferent skies, their lament mingled with the cries of the lost, an anguished chorus of disunion.
And what of the unseeing seers, the nearby vessels who misconstrued Titan’s flaming cries for help, their chance to intervene, alas, drowned by the night? Like specters of a tragic chorus, they did bear witness too late to the unraveling of human pretense.
See how the heroism of Carpathia hastens to salvage what fate had not claimed, an ark of hope amidst the wreckage. With trembling hands and hearts entwined with grief, it did embrace the forsaken, delivering them to safer shores yet forever scarred.
The morn reveals the sobering truth by which empires are humbled—the once-glorious Titanic lies beneath the waves, an eerie testament to the echoes of human imperfection. In the tragedy’s wake, we, from the comfort of retrospection, glean poignant lessons from dreams undone. Thus, Time’s relentless march continues, quietly etching memories upon the pages of history.
Had Shakespeare turned his quill to the tragic tale of the Titanic, he might have woven a haunting sonnet that captures the fleeting nature of hubris and the depths of despair faced by its fateful passengers.
In the manner of his rich, poetic imagery, the Bard would likely have depicted the great vessel, a “Titan of the seas,” sailing with unmatched pride, its “watertight compartments” singing songs of invulnerability, yet carrying within its heart a “fateful flaw.” The tragic irony of its boastful promise of safety could echo through the verses, lamenting, “What dream dost thou pursue, o wondrous ship, / Magnet for the hearts of men, who cling to hope?”
As the night wore on, the fateful collision with icy fate would be rendered as a scene drenched in ominous foreboding. He might craft characters reflecting the souls aboard—“fearful hearts embracing darkness”—their screams lost amidst the stark cries of the sea. The lifeboats, “filled but a meager sixty percent,” would symbolize the shallow salvation that life offers in dire moments, a poignant reminder of fractured destinies and lives cast adrift.
The arrival of the RMS Carpathia could be woven into a narrative of bittersweet rescue—a “beacon amidst the darkened waves”—yet the toll of tragedy would ring clear: “O, list ye! One thousand five hundred souls swept away, / The ocean claims what pride did not foretell.”
Shakespeare would delve into the emotional turmoil of those left behind and those who survived. The narrative would resonate with the haunting echoes of lost voices—“Lost! Lost! How can we name the pain of emptiness?”—capturing the essence of grief, confusion, and the harsh light of truth that follows calamity.
He might conclude with a poignant reflection on the very nature of existence, love, and mortality, encapsulating the tragedy’s enduring legacy: “What vanity doth tread upon the waves, / When even Titans fall, and man is but a moment’s breath?” In this way, the Bard would remind us of our shared humanity, forever linked to the tales of both triumph and tragedy upon the merciless seas.
In the tempest of the Atlantic, a marvel of man’s creation, the Titanic, met its tragic fate. The vessel, deemed unsinkable, bore the weight of human pride and ambition, yet succumbed to the wrath of nature’s icy grasp.
The ship’s distress calls echoed through the night, like desperate cries from a drowning heart, seeking solace in the vast emptiness of the sea. Yet, the cruel hand of fate played its part, as vessels nearby, though alerted, remained distant spectators to the unfolding tragedy.
The Californian, the last to share communion with the Titanic, stood witness to her final symphony of distress flares, yet failed to lend a helping hand. The Carpathia, responding to the Titanic’s earlier pleas, arrived too late to prevent the inevitable.
As the Titanic sank, its lifeboats, symbols of hope, were left half-filled, mirroring the incomplete dreams of those aboard. The fortunate few, 706 souls, were granted passage to New York, the Titanic’s intended haven, while 1,517 others were swallowed by the merciless sea.
In the aftermath, the Carpathia, a messenger of sorrow, traversed the treacherous waters to deliver the grim news. The world mourned, their hearts heavy with the weight of loss, as they grappled with the reality of the tragedy.
Had Shakespeare penned this tale, he would have painted a picture of human resilience amidst despair, of hope clinging onto the edge of desperation. He would have captured the essence of the tragedy, the collective sigh of a world confronted with its own mortality. The sinking of the Titanic would have been more than just a maritime disaster; it would have been a poignant reminder of the fragility of human endeavour in the face of nature’s indifference.
Shakespeare, with his eloquent and dramatic style, might have described the Titanic tragedy as follows:
“In the darkest hour of the night, a star of hope was extinguished, and the unsinkable colossus, Titanic, met her watery grave. A ship of dreams, filled with lives and stories, was swallowed by the abyss, leaving naught but whispers of its grandeur.
The fateful night, a dance of shadows and light, played host to the collision of man’s arrogance and nature’s wrath. The mighty Titanic, a testament to human ingenuity, was brought low by an iceberg, a silent sentinel of the deep.
The ship’s final moments were a symphony of chaos and despair. Lifeboats, barely filled, were cast into the churning sea, while those left behind faced the cold embrace of the Atlantic. The cries of the lost echoed through the night, a mournful serenade to the heavens.
Yet, amidst the tragedy, stories of courage and resilience emerged. The brave souls who rowed through the darkness, the officers who gave their places on the lifeboats, and the crew who tended to the passengers until the end. These are the tales that will live on, a beacon of hope in the face of despair.
In the aftermath, the world stood still, stunned by the magnitude of the disaster. The once proud Titanic, now a memory, served as a stark reminder of the fragility of life and the limits of human ambition.
So, let us remember the Titanic, not for her tragic end, but for the lives she carried, the dreams she held, and the lessons she taught. For in her watery grave, she left behind a legacy that will forever be etched in the annals of history.”
If Shakespeare had written about the Titanic tragedy, he might have described the events in a dramatic and poetic manner, emphasizing the human emotions and tragic flaws that led to the disaster. He could have portrayed the Titanic as a mighty and proud vessel, deemed “unsinkable” by its creators, but ultimately brought down by hubris and the unforgiving power of nature.
The collision with the iceberg might have been described as a catastrophic event, akin to a Greek tragedy, where the gods themselves seemed to conspire against the ship. The distress calls and responses from nearby vessels could have been depicted as a chorus of desperation and frustration, highlighting the tragic delay in rescue efforts.
Shakespeare might have also explored the emotional toll on the passengers and crew, particularly those who were left stranded on the sinking ship, awaiting their fate. The scenes of chaos, panic, and despair as the reality of the situation set in could have been vividly described, with characters lamenting their impending doom and the loss of loved ones.
The role of the SS Californian, which failed to respond to the Titanic’s distress calls, might have been portrayed as a symbol of human indifference and neglect, underscoring the theme of tragic responsibility. The eventual arrival of the RMS Carpathia, which rescued the survivors, could have been seen as a beacon of hope and redemption, highlighting the resilience of the human spirit in the face of adversity.
Throughout the narrative, Shakespeare would likely have woven a complex tapestry of emotions, from the euphoria of the ship’s maiden voyage to the crushing despair of its tragic end. The Titanic’s story would have been transformed into a timeless tale of human folly, tragedy, and the enduring power of the human heart.
As was done for the answers in the previous chapter, claude.ai was used to rate the answers given by the different LLMs, with emphasis on two points: that the question was properly answered, and that the content available in the prompt was used for the answer.
For LLM usage, two different prices are taken into account: one for input tokens and one for output tokens. Here are the token counts used during the previous test:
Microsoft Azure has made significant strides in environmental sustainability: it has been carbon neutral since 2012 and aims to run on 100% renewable energy by 2025. They are also committed to being water positive by 2030 and achieving zero waste certification. Azure’s data centres are designed with energy efficiency in mind, utilizing advanced cooling technologies and optimizing resource allocation. By choosing Azure, businesses can contribute to a greener future while benefiting from the advantages of cloud technology.
Infomaniak positions itself as a more environmentally conscious cloud provider, with a stronger focus on sustainability and renewable energy. Their data centres are located in Switzerland, known for its strict environmental regulations, and they actively promote the use of renewable energy sources. Infomaniak emphasizes transparency about their environmental impact and offsets emissions by 200%. While they acknowledge the significant environmental impact of server manufacturing, their overall approach and commitment to sustainability make them a compelling choice for businesses seeking an environmentally friendly cloud solution.
Hosting with Infomaniak ensures your data remains within Switzerland, adhering to Swiss data protection laws. Swiss law provides strong data protection guarantees, reducing exposure to extra-territorial laws like the USA PATRIOT Act. This contrasts with using OpenAI’s models, which are hosted on U.S.-based infrastructure and subject to U.S. jurisdiction, including compliance with laws such as the PATRIOT Act and FISA.
In comparison, deploying Mistral AI, an open-source LLM created by a French company, on Infomaniak’s infrastructure offers a privacy-centric alternative. Mistral operates under European regulations, ensuring compliance with GDPR, and when hosted on Infomaniak, benefits from Swiss data protection laws. This setup minimizes risks associated with foreign data access requests and enhances control over data processing and storage.
Choosing OpenAI provides access to cutting-edge proprietary models but comes with potential legal complexities tied to U.S. data access laws. Meanwhile, hosting Mistral AI on Infomaniak offers greater sovereignty, transparency, and compliance, fostering trust and security for both your users and your business.