new cleaning steps during the process of creating the embeddings
This commit is contained in:
parent
ad9e7d93aa
commit
c41ffcd5b9
27
embedding.py
27
embedding.py
@ -148,6 +148,29 @@ class EmbeddingModel:
|
||||
|
||||
self.collection.add(embeddings=embeddings, documents=documents, metadatas=metadatas, ids=ids)
|
||||
|
||||
def remove_duplicate(self, lst):
    """Collapse immediately repeated lines and paragraphs, in place.

    Per the original author's note, the extracted text can contain
    duplicates because the textual content of nested HTML tags is kept
    once per tag.

    Two patterns are collapsed, always keeping the first occurrence:

    * a line immediately followed by an identical line;
    * a line, a blank line, then the same line again, where that repeat
      is followed by a blank line or is the last element of the list.

    Duplicates that are not adjacent in one of these two ways are kept.

    Args:
        lst: List of lines (strings). It is modified in place.

    Returns:
        The same list object, after removal of the duplicates.
    """
    i = 0
    while i < len(lst) - 1:
        current = lst[i]
        repeated_after_blank = (
            i + 2 < len(lst)
            and lst[i + 1] == ''
            and lst[i + 2] == current
            # The repeat must close a paragraph: either a blank line follows,
            # or it is the last line (file without a trailing newline).
            and (i + 3 == len(lst) or lst[i + 3] == '')
        )
        if repeated_after_blank:
            # Drop the blank at i+1 and the repeat at i+2 only; the blank
            # that followed the repeat (if any) stays as the separator.
            # Do not advance: the same line may be repeated again.
            del lst[i + 1:i + 3]
        elif lst[i + 1] == current:
            # Plain consecutive duplicate: drop the second copy and
            # re-check the same position.
            del lst[i + 1]
        else:
            i += 1
    return lst
|
||||
|
||||
def remove_footer(self, lst):
    """Truncate the list at the first occurrence of the footer marker.

    The marker is three consecutive entries: "Caisse d'Epargne",
    "Rhône Alpes", "Formuler une demande en ligne". When found, the
    marker and everything after it are removed in place. A list without
    the marker is left untouched.

    Args:
        lst: List of text chunks. It is modified in place.

    Returns:
        The same list object, possibly shortened.
    """
    marker = ["Caisse d'Epargne", "Rhône Alpes", "Formuler une demande en ligne"]
    width = len(marker)
    last_start = len(lst) - width
    # Index of the first window equal to the marker, or None if absent.
    cut = next(
        (pos for pos in range(last_start + 1) if lst[pos:pos + width] == marker),
        None,
    )
    if cut is not None:
        del lst[cut:]
    return lst
|
||||
|
||||
def embed_folder(self, folder_path):
|
||||
"""
|
||||
Embeds all the .txt files within a specified folder into a ChromaDB collection using a specified embedding model.
|
||||
@ -167,12 +190,16 @@ class EmbeddingModel:
|
||||
file_path = os.path.join(folder_path, filename)
|
||||
with open(file_path, 'r') as file:
|
||||
file_contents = file.read()
|
||||
file_contents = '\n'.join(self.remove_duplicate(file_contents.split('\n')))
|
||||
contents_lst = [str.replace('\n',' ').replace('\xa0', ' ') for str in file_contents.split('\n\n')]
|
||||
contents_lst = self.remove_footer(contents_lst)
|
||||
if len(contents_lst) < 3: # contents_lst[0] is the URL, contents_lst[1] is the title, the rest is the content
|
||||
continue
|
||||
url = contents_lst[0]
|
||||
if '?' in url: # URLs with a '?' corresponds to call to services and have no useful content
|
||||
continue
|
||||
if not url.startswith('https://www.caisse-epargne.fr/rhone-alpes/'):
|
||||
continue
|
||||
title = contents_lst[1]
|
||||
if not title: # when the title is absent (or empty), the page has no interest
|
||||
continue
|
||||
|
2
rag.py
2
rag.py
@ -127,7 +127,7 @@ Question reformulée : "
|
||||
|
||||
def answer(self, prompt, stream):
|
||||
response = self.llm(prompt = prompt,
|
||||
temperature = 0.7,
|
||||
temperature = 0.1,
|
||||
mirostat_mode = 2,
|
||||
stream = stream,
|
||||
max_tokens = -1,
|
||||
|
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user