{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d8acc709-ebb2-4fa6-982b-3d13fe8d2beb",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/peportier/miniforge3/envs/RAG_ENV/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n",
      "/Users/peportier/miniforge3/envs/RAG_ENV/lib/python3.9/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
      " _torch_pytree._register_pytree_node(\n"
     ]
    }
   ],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import base64\n",
    "import re\n",
    "from transformers import AutoTokenizer\n",
    "import logging\n",
    "import os\n",
    "from IPython.display import Markdown, display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "54f5ab50-2ee3-45ad-9208-c1e2dc362152",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "model_name = 'intfloat/multilingual-e5-large'\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8c9e6f4e-609d-488d-a738-41934a62e92a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def token_length(text):\n",
    "    \"\"\"Count the tokens the module-level `tokenizer` produces for `text`.\n",
    "\n",
    "    Special tokens are left out of the count (`add_special_tokens=False`).\n",
    "    \"\"\"\n",
    "    token_ids = tokenizer.encode(text, add_special_tokens=False)\n",
    "    return len(token_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8deef599-04b0-4d9f-9b3e-ac9ae5a472a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def passage_str(paragraphs, title, subtitle):\n",
    "    \"\"\"Assemble the text of one passage.\n",
    "\n",
    "    The result is the literal prefix 'passage: ' followed by `title`, a blank\n",
    "    line, `subtitle`, a blank line, and then the strings of `paragraphs`\n",
    "    joined with single newlines.\n",
    "    \"\"\"\n",
    "    header = f\"passage: {title}\\n\\n{subtitle}\\n\\n\"\n",
    "    body = '\\n'.join(paragraphs)\n",
    "    return header + body"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1ef97436-37c2-45b4-8e00-7737d87c261e",
   "metadata": {},
   "outputs": [],
   "source": [
    "html_folder_path = '../scrapcera/htmls/'\n",
    "txt_folder_path = '../scrapcera/docs/'\n",
"html_filename = '97e88fd1d6.html'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "e6da54d7-6c70-44eb-b08b-392c742d0525",
   "metadata": {},
   "outputs": [],
   "source": [
    "chunks_length = []\n",
    "\n",
    "\n",
    "def detect_big_chunks(struct_page, filename, max_chunk_size=512):\n",
    "    \"\"\"Record in `chunks_length` the character length of the passages of a page.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    struct_page : dict\n",
    "        Maps 'title' to the page title and every other key (a section\n",
    "        title) to the list of paragraph strings of that section.\n",
    "    filename : str\n",
    "        Currently unused; kept so existing calls keep working.\n",
    "    max_chunk_size : int, optional\n",
    "        Token budget of one passage, as measured by `token_length`.\n",
    "\n",
    "    A section whose passage exceeds `max_chunk_size` tokens is split\n",
    "    greedily into consecutive sub-passages, each rebuilt with\n",
    "    `passage_str` so that it repeats the title and subtitle. The function\n",
    "    only appends to the module-level list `chunks_length`; it returns None.\n",
    "    \"\"\"\n",
    "    title = struct_page['title']\n",
    "    for subtitle, paragraphs in struct_page.items():\n",
    "        if subtitle == 'title':\n",
    "            continue\n",
    "        doc_str = passage_str(paragraphs, title, subtitle)\n",
    "        if token_length(doc_str) > max_chunk_size:\n",
    "            sub_paragraphs = []\n",
    "            paragraph_index = 0\n",
    "            while paragraph_index < len(paragraphs):\n",
    "                sub_paragraphs.append(paragraphs[paragraph_index])\n",
    "                paragraph_index += 1\n",
    "                sub_paragraphs_str = passage_str(sub_paragraphs, title, subtitle)\n",
    "                if token_length(sub_paragraphs_str) < max_chunk_size:\n",
    "                    continue  # still within budget: keep accumulating\n",
    "                if len(sub_paragraphs) > 1:\n",
    "                    # The paragraph just added overflowed the budget: close\n",
    "                    # the chunk without it and start the next chunk from it.\n",
    "                    closed_str = passage_str(sub_paragraphs[:-1], title, subtitle)\n",
    "                    chunks_length.append(len(closed_str))\n",
    "                    paragraph_index -= 1\n",
    "                else:\n",
    "                    # A single paragraph is over budget on its own. Emit it\n",
    "                    # as its own chunk; rewinding here would retry the same\n",
    "                    # paragraph forever.\n",
    "                    chunks_length.append(len(sub_paragraphs_str))\n",
    "                sub_paragraphs = []\n",
    "            if sub_paragraphs:\n",
    "                # Remaining paragraphs that fit within the budget.\n",
    "                chunks_length.append(len(passage_str(sub_paragraphs, title, subtitle)))\n",
    "        # NOTE(review): recorded for every section, including the ones split\n",
    "        # above - confirm that counting oversized sections twice is intended.\n",
    "        chunks_length.append(len(doc_str))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "560280af-ad79-43e8-b4df-c4f69aa40dcf",
   "metadata": {},
   "outputs": [],
   "source": [
    "heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']\n",
    "\n",
    "for html_filename in ['f6d921ced8.html']: # os.listdir(html_folder_path):\n",
    "\n",
    "    html_file_path = os.path.join(html_folder_path, html_filename)\n",
    "    # Anchored so that only the trailing extension is replaced.\n",
    "    txt_filename = re.sub(r'\\.html$', '.txt', html_filename)\n",
    "    txt_file_path = os.path.join(txt_folder_path, txt_filename)\n",
    "    # Explicit encoding: the default used by open() depends on the platform.\n",
    "    with open(txt_file_path, 'r', encoding='utf-8') as file:\n",
    "        txt_file_contents = file.read()\n",
    "\n",
    "    # The first line of the .txt file is used as the URL of the page.\n",
    "    url = txt_file_contents.split('\\n')[0]\n",
    "    if '?' in url: # URLs containing '?' are calls to services and have no useful content\n",
    "        continue\n",
    "    if not url.startswith('https://www.caisse-epargne.fr/rhone-alpes/'):\n",
    "        continue\n",
    "\n",
    "    prefix = 'https://www.caisse-epargne.fr/'\n",
    "    suffix = url.replace(prefix, '')\n",
    "    tags = suffix.split('/')\n",
    "    tags = [tag for tag in tags if tag] # remove empty parts\n",
    "    with open(html_file_path, 'r', encoding='utf-8') as file:\n",
    "        html_file_contents = file.read()\n",
    "    soup = BeautifulSoup(html_file_contents, 'html.parser')\n",
    "    # soup.find returns None when the page has no section element; skip such\n",
    "    # a page instead of failing with AttributeError.\n",
    "    first_section = soup.find('section')\n",
    "    page_title_present = first_section.find('h1') if first_section is not None else None\n",
    "    if not page_title_present:\n",
    "        continue\n",
    "    page_title = page_title_present.get_text()\n",
    "\n",
    "    sections = soup.find_all(lambda tag: tag.name in ['section'] and 'key-informations' not in tag.get('class', []))\n",
    "\n",
    "    # struct_page maps 'title' to the page title and each section title to\n",
    "    # the list of paragraph texts collected for it.\n",
    "    struct_page = {'title': page_title}\n",
    "    current_section = ''\n",
    "    for section in sections:\n",
    "        for wysiwyg_tag in section.find_all(class_=\"wysiwyg\"):\n",
    "            # Check for a title within the wysiwyg container\n",
    "            internal_title = wysiwyg_tag.find(heading_tags) or wysiwyg_tag.find('p', class_='title')\n",
    "\n",
    "            # If no internal title, find the nearest title from previous siblings\n",
    "            if not internal_title:\n",
    "                nearest_title = None\n",
    "                for sibling in wysiwyg_tag.find_previous_siblings():\n",
    "                    if sibling.name in heading_tags:\n",
    "                        nearest_title = sibling.get_text().strip()\n",
    "                        break\n",
    "                    if sibling.name == 'p' and 'title' in sibling.get('class', []):\n",
    "                        nearest_title = sibling.get_text().strip()\n",
    "                        break\n",
    "                if nearest_title:\n",
    "                    # Drop single-digit markers such as '(1)' and a leading\n",
    "                    # numbering such as '1. ' from the title.\n",
    "                    nearest_title = re.sub(r'\\(\\d\\)', '', nearest_title)\n",
    "                    nearest_title = re.sub(r'^\\d+\\.\\s*', '', nearest_title)\n",
    "                    current_section = nearest_title\n",
    "                    struct_page[current_section] = []\n",
    "                else:\n",
    "                    # No title inside or before this container: skip it.\n",
    "                    continue\n",
    "            for child in wysiwyg_tag.find_all(['p', 'li'] + heading_tags):\n",
    "                text = child.get_text().strip()\n",
    "                text = re.sub(r'\\(\\d\\)', '', text)\n",
    "                if child.name.startswith('h') or (child.name == 'p' and 'title' in child.get('class', [])):\n",
    "                    # A heading, or a paragraph of class 'title', opens a new section.\n",
    "                    text = re.sub(r'^\\d+\\.\\s*', '', text)\n",
    "                    current_section = text\n",
    "                    struct_page[current_section] = []\n",
    "                else: # paragraph (p) not of class 'title', or list item (li)\n",
    "                    if 'is-style-mentions' not in child.get('class', []):\n",
    "                        if current_section in struct_page:\n",
    "                            struct_page[current_section].append(text)\n",
    "\n",
    "    # detect_big_chunks(struct_page, html_filename)"
   ]
  },
  {
   "cell_type":
"code", "execution_count": 9, "id": "8a534ec5-a85a-41bf-b229-c896612cec42", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'title': 'Devenez sociétaire !',\n", " 'Qui peut devenir sociétaire ?': ['Tous les clients de la Caisse d’Epargne peuvent souscrire des parts sociales : particuliers, personnes morales (associations, entreprises), EPCI (Établissements Publics de Coopération Intercommunale) à fiscalité propre. Les collectivités territoriales peuvent également devenir sociétaires.'],\n", " 'Comment devenir sociétaire\\xa0?': ['Vous souscrivez vos parts sociales de la Société Locale d’Epargne (SLE) auprès de l’agence où est domicilié votre compte principal. Pour tout renseignement, contactez votre conseiller, il saura vous orienter.'],\n", " 'Le site sociétaires': ['Sur www.societaires.caisse-epargne.fr, vous disposez d’un site d’information et d’avantages sélectionnés pour vous. Vous y découvrirez les réalisations et engagements de votre Caisse d’Epargne sur votre territoire : actualité, partenariats, soutien aux actions sociétales…',\n", " 'C’est aussi une source incontournable d’informations sur l’organisation et les valeurs coopératives, les assemblées générales, la vie du sociétariat et des sociétés locales d’épargne.']}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "struct_page" ] }, { "cell_type": "code", "execution_count": null, "id": "fdd70455-c279-4b08-87e9-d42c5c093bc6", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "RAG_ENV", "language": "python", "name": "rag_env" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.18" } }, "nbformat": 4, "nbformat_minor": 5 }