New cleaning steps when creating the embeddings

Pierre-Edouard Portier 2024-01-07 20:31:08 +01:00
parent ad9e7d93aa
commit c41ffcd5b9
3 changed files with 2822 additions and 42 deletions


@@ -148,6 +148,29 @@ class EmbeddingModel:
         self.collection.add(embeddings=embeddings, documents=documents, metadatas=metadatas, ids=ids)
 
+    def remove_duplicate(self, lst):
+        # file_contents can contain duplicate lines because we keep the
+        # textual content of HTML tags that can be nested inside one another
+        i = 0
+        while i < len(lst) - 1:
+            if i < len(lst) - 3 and lst[i] == lst[i + 2] and lst[i + 1] == lst[i + 3] == '':
+                # Remove lst[i + 1] and lst[i + 2], keeping one copy and its blank separator
+                del lst[i + 1:i + 3]
+            elif lst[i] == lst[i + 1]:
+                # Remove the adjacent duplicate lst[i + 1]
+                del lst[i + 1]
+            else:
+                i += 1
+        return lst
+
+    def remove_footer(self, lst):
+        sequence = ["Caisse d'Epargne", "Rhône Alpes", "Formuler une demande en ligne"]
+        for i in range(len(lst) - 2):
+            if lst[i:i + 3] == sequence:
+                del lst[i:]
+                break
+        return lst
+
     def embed_folder(self, folder_path):
         """
         Embeds all the .txt files within a specified folder into a ChromaDB collection using a specified embedding model.
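
The two new helpers are easiest to sanity-check on a toy input. Below is a minimal standalone sketch, with the method bodies copied out of the class as free functions and the sample strings invented for illustration: a nested pair of HTML tags yields the same text twice, separated by blank lines, and remove_duplicate keeps one copy together with its blank separator, while remove_footer truncates the page at the first occurrence of the site footer.

    def remove_duplicate(lst):
        i = 0
        while i < len(lst) - 1:
            if i < len(lst) - 3 and lst[i] == lst[i + 2] and lst[i + 1] == lst[i + 3] == '':
                del lst[i + 1:i + 3]   # drop the repeated line and its blank separator
            elif lst[i] == lst[i + 1]:
                del lst[i + 1]         # drop an adjacent duplicate
            else:
                i += 1
        return lst

    def remove_footer(lst):
        sequence = ["Caisse d'Epargne", "Rhône Alpes", "Formuler une demande en ligne"]
        for i in range(len(lst) - 2):
            if lst[i:i + 3] == sequence:
                del lst[i:]            # everything from the footer marker onwards is dropped
                break
        return lst

    print(remove_duplicate(['Nos offres', '', 'Nos offres', '', 'Contact', 'Contact']))
    # -> ['Nos offres', '', 'Contact']
    print(remove_footer(['Titre', 'Contenu', "Caisse d'Epargne", "Rhône Alpes", "Formuler une demande en ligne"]))
    # -> ['Titre', 'Contenu']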
@@ -167,12 +190,16 @@ class EmbeddingModel:
             file_path = os.path.join(folder_path, filename)
             with open(file_path, 'r') as file:
                 file_contents = file.read()
+            file_contents = '\n'.join(self.remove_duplicate(file_contents.split('\n')))
             contents_lst = [s.replace('\n', ' ').replace('\xa0', ' ') for s in file_contents.split('\n\n')]
+            contents_lst = self.remove_footer(contents_lst)
             if len(contents_lst) < 3:  # contents_lst[0] is the URL, contents_lst[1] is the title, the rest is the content
                 continue
             url = contents_lst[0]
             if '?' in url:  # URLs containing a '?' correspond to calls to services and have no useful content
                 continue
+            if not url.startswith('https://www.caisse-epargne.fr/rhone-alpes/'):
+                continue
             title = contents_lst[1]
             if not title:  # when the title is absent (or empty), the page is of no interest
                 continue
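
For context, each extracted .txt file is expected to consist of '\n\n'-separated blocks: the page URL first, the page title second, and the content after that. The following sketch, with an invented URL and invented text, shows a document that passes every filter above:

    sample = (
        "https://www.caisse-epargne.fr/rhone-alpes/particuliers/compte-bancaire\n\n"
        "Compte bancaire\n\n"
        "Ouvrez un compte en quelques minutes\xa0: pièce d'identité et justificatif de domicile."
    )
    contents_lst = [s.replace('\n', ' ').replace('\xa0', ' ') for s in sample.split('\n\n')]
    assert len(contents_lst) >= 3
    url, title = contents_lst[0], contents_lst[1]
    assert '?' not in url   # not a call to a service
    assert url.startswith('https://www.caisse-epargne.fr/rhone-alpes/')
    assert title            # pages with an empty title are skipped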

rag.py

@@ -127,7 +127,7 @@ Question reformulée : "
 
     def answer(self, prompt, stream):
         response = self.llm(prompt = prompt,
-                            temperature = 0.7,
+                            temperature = 0.1,
                             mirostat_mode = 2,
                             stream = stream,
                             max_tokens = -1,
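
Lowering the temperature from 0.7 to 0.1 makes sampling close to greedy decoding, so answers stay near the retrieved context instead of improvising, which is usually what a RAG assistant wants. The keyword arguments suggest self.llm is a llama-cpp-python Llama instance; a minimal sketch of the same call pattern, with a hypothetical model path and prompt:

    from llama_cpp import Llama

    llm = Llama(model_path="models/model.Q4_K_M.gguf")  # hypothetical model file
    response = llm(
        prompt="Contexte : ...\nQuestion : ...\nRéponse :",  # hypothetical prompt
        temperature=0.1,   # near-greedy sampling, fewer invented details
        mirostat_mode=2,   # Mirostat 2.0 adaptive sampling, as in the diff above
        max_tokens=-1,     # generate until EOS or the end of the context window
        stream=False,
    )
    print(response["choices"][0]["text"])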

File diff suppressed because it is too large