New cleaning steps when creating the embeddings

Pierre-Edouard Portier 2024-01-07 20:31:08 +01:00
parent ad9e7d93aa
commit c41ffcd5b9
3 changed files with 2822 additions and 42 deletions


@@ -148,6 +148,29 @@ class EmbeddingModel:
         self.collection.add(embeddings=embeddings, documents=documents, metadatas=metadatas, ids=ids)
 
+    def remove_duplicate(self, lst):
+        # file_contents can contain duplicate lines because we keep the
+        # textual content of HTML tags that can be nested inside one another
+        i = 0
+        while i < len(lst) - 1:
+            if i < len(lst) - 3 and lst[i] == lst[i + 2] and lst[i + 1] == lst[i + 3] == '':
+                # Remove lst[i + 1] and lst[i + 2], keeping one copy and its blank separator
+                del lst[i + 1:i + 3]
+            elif lst[i] == lst[i + 1]:
+                # Remove the adjacent duplicate lst[i + 1]
+                del lst[i + 1]
+            else:
+                i += 1
+        return lst
+
+    def remove_footer(self, lst):
+        sequence = ["Caisse d'Epargne", "Rhône Alpes", "Formuler une demande en ligne"]
+        for i in range(len(lst) - 2):
+            if lst[i:i + 3] == sequence:
+                del lst[i:]
+                break
+        return lst
+
     def embed_folder(self, folder_path):
         """
         Embeds all the .txt files within a specified folder into a ChromaDB collection using a specified embedding model.
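
The two new helpers are easiest to sanity-check on a toy input. Below is a minimal standalone sketch, with the method bodies copied out of the class as free functions and the sample strings invented for illustration: a nested pair of HTML tags yields the same text twice, separated by blank lines, and remove_duplicate keeps one copy together with its blank separator, while remove_footer truncates the page at the first occurrence of the site footer.

    def remove_duplicate(lst):
        i = 0
        while i < len(lst) - 1:
            if i < len(lst) - 3 and lst[i] == lst[i + 2] and lst[i + 1] == lst[i + 3] == '':
                del lst[i + 1:i + 3]   # drop the repeated line and its blank separator
            elif lst[i] == lst[i + 1]:
                del lst[i + 1]         # drop an adjacent duplicate
            else:
                i += 1
        return lst

    def remove_footer(lst):
        sequence = ["Caisse d'Epargne", "Rhône Alpes", "Formuler une demande en ligne"]
        for i in range(len(lst) - 2):
            if lst[i:i + 3] == sequence:
                del lst[i:]            # everything from the footer marker onwards is dropped
                break
        return lst

    print(remove_duplicate(['Nos offres', '', 'Nos offres', '', 'Contact', 'Contact']))
    # -> ['Nos offres', '', 'Contact']
    print(remove_footer(['Titre', 'Contenu', "Caisse d'Epargne", "Rhône Alpes", "Formuler une demande en ligne"]))
    # -> ['Titre', 'Contenu']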
@@ -167,12 +190,16 @@ class EmbeddingModel:
             file_path = os.path.join(folder_path, filename)
             with open(file_path, 'r') as file:
                 file_contents = file.read()
+            file_contents = '\n'.join(self.remove_duplicate(file_contents.split('\n')))
             contents_lst = [s.replace('\n', ' ').replace('\xa0', ' ') for s in file_contents.split('\n\n')]
+            contents_lst = self.remove_footer(contents_lst)
             if len(contents_lst) < 3:  # contents_lst[0] is the URL, contents_lst[1] is the title, the rest is the content
                 continue
             url = contents_lst[0]
             if '?' in url:  # URLs containing a '?' correspond to calls to services and have no useful content
                 continue
+            if not url.startswith('https://www.caisse-epargne.fr/rhone-alpes/'):
+                continue
             title = contents_lst[1]
             if not title:  # when the title is absent (or empty), the page is of no interest
                 continue
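
For context, each extracted .txt file is expected to consist of '\n\n'-separated blocks: the page URL first, the page title second, and the content after that. The following sketch, with an invented URL and invented text, shows a document that passes every filter above:

    sample = (
        "https://www.caisse-epargne.fr/rhone-alpes/particuliers/compte-bancaire\n\n"
        "Compte bancaire\n\n"
        "Ouvrez un compte en quelques minutes\xa0: pièce d'identité et justificatif de domicile."
    )
    contents_lst = [s.replace('\n', ' ').replace('\xa0', ' ') for s in sample.split('\n\n')]
    assert len(contents_lst) >= 3
    url, title = contents_lst[0], contents_lst[1]
    assert '?' not in url   # not a call to a service
    assert url.startswith('https://www.caisse-epargne.fr/rhone-alpes/')
    assert title            # pages with an empty title are skipped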

rag.py

@@ -127,7 +127,7 @@ Question reformulée : "
 
     def answer(self, prompt, stream):
         response = self.llm(prompt = prompt,
-                            temperature = 0.7,
+                            temperature = 0.1,
                             mirostat_mode = 2,
                             stream = stream,
                             max_tokens = -1,
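
Lowering the temperature from 0.7 to 0.1 makes sampling close to greedy decoding, so answers stay near the retrieved context instead of improvising, which is usually what a RAG assistant wants. The keyword arguments suggest self.llm is a llama-cpp-python Llama instance; a minimal sketch of the same call pattern, with a hypothetical model path and prompt:

    from llama_cpp import Llama

    llm = Llama(model_path="models/model.Q4_K_M.gguf")  # hypothetical model file
    response = llm(
        prompt="Contexte : ...\nQuestion : ...\nRéponse :",  # hypothetical prompt
        temperature=0.1,   # near-greedy sampling, fewer invented details
        mirostat_mode=2,   # Mirostat 2.0 adaptive sampling, as in the diff above
        max_tokens=-1,     # generate until EOS or the end of the context window
        stream=False,
    )
    print(response["choices"][0]["text"])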

File diff suppressed because it is too large