Add new cleaning steps to the embedding-creation process
This commit is contained in:
parent
ad9e7d93aa
commit
c41ffcd5b9
27
embedding.py
27
embedding.py
|
@ -148,6 +148,29 @@ class EmbeddingModel:
|
||||||
|
|
||||||
self.collection.add(embeddings=embeddings, documents=documents, metadatas=metadatas, ids=ids)
|
self.collection.add(embeddings=embeddings, documents=documents, metadatas=metadatas, ids=ids)
|
||||||
|
|
||||||
|
def remove_duplicate(self, lst):
    """Collapse consecutive duplicate lines of ``lst`` in place and return it.

    File contents can contain duplicate lines because the textual content of
    several HTML tags is kept, and those tags can be nested in one another.
    Scanning left to right, two patterns are collapsed:

    * ``X, '', X, ''`` becomes ``X, ''``: the blank line and the repeated
      ``X`` that follow the first ``X`` are dropped, so exactly one blank
      separator remains after the surviving ``X``.
    * ``X, X`` becomes ``X``.

    The surviving element is compared again with what follows it, so longer
    runs of repetitions collapse down to a single occurrence.

    Args:
        lst: List of lines. It is modified in place.

    Returns:
        The same list object, with the duplicates removed.
    """
    if not lst:
        return lst

    total = len(lst)
    # Build the result in a separate list instead of deleting from ``lst``
    # while scanning: each ``del`` in the middle of a list shifts the tail,
    # which makes the scan quadratic on long files.
    kept = [lst[0]]
    pos = 1  # index of the next unread element of ``lst``
    while pos < total:
        current = kept[-1]
        if (pos + 2 < total
                and current == lst[pos + 1]
                and lst[pos] == lst[pos + 2] == ''):
            # "X, '', X, ''": skip the blank line and the repeated X; the
            # second blank line is examined on the next iteration.
            pos += 2
        elif current == lst[pos]:
            # "X, X": skip the immediate repetition.
            pos += 1
        else:
            kept.append(lst[pos])
            pos += 1

    # Slice assignment keeps the caller's list object, as before.
    lst[:] = kept
    return lst
|
||||||
|
|
||||||
|
def remove_footer(self, lst, sequence=None):
    """Truncate ``lst`` in place at the first occurrence of the footer marker.

    The list is searched for the first position where the consecutive
    elements equal ``sequence``. When found, the marker and everything
    after it are deleted. When not found, the list is left untouched.

    Args:
        lst: List of text chunks. It is modified in place.
        sequence: Consecutive elements that mark the start of the footer.
            Defaults to the three-element marker used so far. An empty
            sequence is ignored and nothing is removed.

    Returns:
        The same list object, possibly truncated.
    """
    if sequence is None:
        sequence = ["Caisse d'Epargne", "Rhône Alpes", "Formuler une demande en ligne"]
    else:
        # Normalise so that tuples or other iterables compare equal to the
        # list slices taken below.
        sequence = list(sequence)

    width = len(sequence)
    if not width:
        # An empty marker would match at index 0 and wipe the whole list.
        return lst

    # A negative range bound (list shorter than the marker) yields no
    # iterations, so short lists are returned unchanged.
    for start in range(len(lst) - width + 1):
        if lst[start:start + width] == sequence:
            del lst[start:]
            break
    return lst
|
||||||
|
|
||||||
def embed_folder(self, folder_path):
|
def embed_folder(self, folder_path):
|
||||||
"""
|
"""
|
||||||
Embeds all the .txt files within a specified folder into a ChromaDB collection using a specified embedding model.
|
Embeds all the .txt files within a specified folder into a ChromaDB collection using a specified embedding model.
|
||||||
|
@ -167,12 +190,16 @@ class EmbeddingModel:
|
||||||
file_path = os.path.join(folder_path, filename)
|
file_path = os.path.join(folder_path, filename)
|
||||||
with open(file_path, 'r') as file:
|
with open(file_path, 'r') as file:
|
||||||
file_contents = file.read()
|
file_contents = file.read()
|
||||||
|
file_contents = '\n'.join(self.remove_duplicate(file_contents.split('\n')))
|
||||||
contents_lst = [str.replace('\n',' ').replace('\xa0', ' ') for str in file_contents.split('\n\n')]
|
contents_lst = [str.replace('\n',' ').replace('\xa0', ' ') for str in file_contents.split('\n\n')]
|
||||||
|
contents_lst = self.remove_footer(contents_lst)
|
||||||
if len(contents_lst) < 3: # contents_lst[0] is the URL, contents_lst[1] is the title, the rest is the content
|
if len(contents_lst) < 3: # contents_lst[0] is the URL, contents_lst[1] is the title, the rest is the content
|
||||||
continue
|
continue
|
||||||
url = contents_lst[0]
|
url = contents_lst[0]
|
||||||
if '?' in url: # URLs with a '?' corresponds to call to services and have no useful content
|
if '?' in url: # URLs with a '?' corresponds to call to services and have no useful content
|
||||||
continue
|
continue
|
||||||
|
if not url.startswith('https://www.caisse-epargne.fr/rhone-alpes/'):
|
||||||
|
continue
|
||||||
title = contents_lst[1]
|
title = contents_lst[1]
|
||||||
if not title: # when the title is absent (or empty), the page has no interest
|
if not title: # when the title is absent (or empty), the page has no interest
|
||||||
continue
|
continue
|
||||||
|
|
2
rag.py
2
rag.py
|
@ -127,7 +127,7 @@ Question reformulée : "
|
||||||
|
|
||||||
def answer(self, prompt, stream):
|
def answer(self, prompt, stream):
|
||||||
response = self.llm(prompt = prompt,
|
response = self.llm(prompt = prompt,
|
||||||
temperature = 0.7,
|
temperature = 0.1,
|
||||||
mirostat_mode = 2,
|
mirostat_mode = 2,
|
||||||
stream = stream,
|
stream = stream,
|
||||||
max_tokens = -1,
|
max_tokens = -1,
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user