new cleaning steps during the process of creating the embeddings

2024-01-07 20:31:08 +01:00 · 2024-01-07 20:31:08 +01:00 · c41ffcd5b9
commit c41ffcd5b9
parent ad9e7d93aa
3 changed files with 2822 additions and 42 deletions
--- a/embedding.py
+++ b/embedding.py
@ -148,6 +148,29 @@ class EmbeddingModel:
        self.collection.add(embeddings=embeddings, documents=documents, metadatas=metadatas, ids=ids)
    def remove_duplicate(self, lst):
        """Collapse repeated entries of ``lst`` in place and return it.

        The extracted text can contain the same line several times because
        the textual content of nested HTML tags is kept for each tag.

        Two patterns are reduced, scanning from left to right:

        * ``[X, '', X, '']`` becomes ``[X, '']`` (the first empty entry and
          the second ``X`` are dropped);
        * two equal neighbours ``[X, X]`` become ``[X]``.

        After a removal the same position is examined again, so longer runs
        of repetitions are fully collapsed.

        Args:
            lst: list of lines; it is modified in place.

        Returns:
            The same list object, after deduplication.
        """
        pos = 0
        while pos + 1 < len(lst):
            head = lst[pos]
            window_fits = pos + 3 < len(lst)
            if (window_fits
                    and head == lst[pos + 2]
                    and lst[pos + 1] == ''
                    and lst[pos + 3] == ''):
                # Drop the two entries at pos+1 and pos+2; the empty entry
                # that was at pos+3 now directly follows ``head``.
                lst[pos + 1:pos + 3] = []
                continue
            if head == lst[pos + 1]:
                # Plain adjacent duplicate: keep only the first occurrence.
                lst.pop(pos + 1)
                continue
            pos += 1
        return lst
    def remove_footer(self, lst):
        """Truncate ``lst`` in place at the first occurrence of the footer.

        The footer is recognised by three consecutive entries equal to
        "Caisse d'Epargne", "Rhône Alpes" and "Formuler une demande en ligne".
        Those entries and everything after them are removed. When the marker
        is not found the list is left untouched.

        Args:
            lst: list of text entries; it is modified in place.

        Returns:
            The same list object, possibly shortened.
        """
        footer_marker = ["Caisse d'Epargne", "Rhône Alpes", "Formuler une demande en ligne"]
        width = len(footer_marker)
        # Index of the first window matching the marker, or None if absent.
        cut_at = next(
            (start for start in range(len(lst) - width + 1)
             if lst[start:start + width] == footer_marker),
            None,
        )
        if cut_at is not None:
            del lst[cut_at:]
        return lst
    def embed_folder(self, folder_path):
        """
        Embeds all the .txt files within a specified folder into a ChromaDB collection using a specified embedding model.
@ -167,12 +190,16 @@ class EmbeddingModel:
                file_path = os.path.join(folder_path, filename)
                with open(file_path, 'r') as file:
                    file_contents = file.read()
                file_contents = '\n'.join(self.remove_duplicate(file_contents.split('\n')))
                contents_lst = [str.replace('\n',' ').replace('\xa0', ' ') for str in file_contents.split('\n\n')]
                contents_lst = self.remove_footer(contents_lst)
                if len(contents_lst) < 3: # contents_lst[0] is the URL, contents_lst[1] is the title, the rest is the content
                    continue
                url = contents_lst[0]
                if '?' in url: # URLs with a '?' corresponds to call to services and have no useful content
                    continue
                if not url.startswith('https://www.caisse-epargne.fr/rhone-alpes/'):
                    continue
                title = contents_lst[1]
                if not title: # when the title is absent (or empty), the page has no interest
                    continue
--- a/rag.py
+++ b/rag.py
@ -127,7 +127,7 @@ Question reformulée : "
    def answer(self, prompt, stream):
        response = self.llm(prompt = prompt,
-            temperature = 0.7,
+            temperature = 0.1,
            mirostat_mode = 2,
            stream = stream,
            max_tokens = -1,
--- a/rag_fr_embedding.ipynb
+++ b/rag_fr_embedding.ipynb