rag/rag_fr_embedding_test.ipynb

253 lines
11 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d8acc709-ebb2-4fa6-982b-3d13fe8d2beb",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/peportier/miniforge3/envs/RAG_ENV/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/Users/peportier/miniforge3/envs/RAG_ENV/lib/python3.9/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
" _torch_pytree._register_pytree_node(\n"
]
}
],
"source": [
"from bs4 import BeautifulSoup\n",
"import base64\n",
"import re\n",
"from transformers import AutoTokenizer\n",
"import logging\n",
"import os\n",
"from IPython.display import Markdown, display"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "54f5ab50-2ee3-45ad-9208-c1e2dc362152",
"metadata": {},
"outputs": [],
"source": [
"# AutoTokenizer is already imported in the notebook's first (imports) cell,\n",
"# so it is not imported a second time here.\n",
"# `tokenizer` is the module-level tokenizer that token_length() relies on.\n",
"model_name = 'intfloat/multilingual-e5-large'\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8c9e6f4e-609d-488d-a738-41934a62e92a",
"metadata": {},
"outputs": [],
"source": [
"def token_length(text):\n",
"    \"\"\"Return how many tokens the module-level `tokenizer` produces for `text`.\n",
"\n",
"    Special tokens are left out of the count (`add_special_tokens=False`).\n",
"    \"\"\"\n",
"    token_ids = tokenizer.encode(text, add_special_tokens=False)\n",
"    return len(token_ids)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8deef599-04b0-4d9f-9b3e-ac9ae5a472a0",
"metadata": {},
"outputs": [],
"source": [
"def passage_str(paragraphs, title, subtitle):\n",
"    \"\"\"Assemble the text of one passage.\n",
"\n",
"    The result is a header made of the literal prefix 'passage: ' followed by\n",
"    `title`, a blank line, `subtitle` and another blank line, then the items of\n",
"    `paragraphs` joined with single newlines.\n",
"    \"\"\"\n",
"    # NOTE(review): 'passage: ' is presumably the prefix the E5 embedding model expects - confirm.\n",
"    header = f\"passage: {title}\\n\\n{subtitle}\\n\\n\"\n",
"    body = '\\n'.join(paragraphs)\n",
"    return header + body"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1ef97436-37c2-45b4-8e00-7737d87c261e",
"metadata": {},
"outputs": [],
"source": [
"# Folders of the scraped pages: the raw HTML files and their companion .txt files\n",
"# (the parsing loop reads the page URL from the first line of the .txt file).\n",
"html_folder_path = '../scrapcera/htmls/'\n",
"txt_folder_path = '../scrapcera/docs/'\n",
"# NOTE(review): the parsing loop uses `html_filename` as its loop variable, so this value is overwritten there.\n",
"html_filename = '97e88fd1d6.html'"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "560280af-ad79-43e8-b4df-c4f69aa40dcf",
"metadata": {},
"outputs": [],
"source": [
"# Build `struct_page` for each scraped page: the page title under the key 'title',\n",
"# then one entry per section title, mapping to the list of that section's paragraph texts.\n",
"heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']\n",
"\n",
"for html_filename in ['f6d921ced8.html']:  # os.listdir(html_folder_path):\n",
"    if not html_filename.endswith('.html'):\n",
"        # Skip anything that is not an .html file (e.g. hidden files returned by os.listdir)\n",
"        continue\n",
"\n",
"    html_file_path = os.path.join(html_folder_path, html_filename)\n",
"    # Companion text file: same name, with the trailing .html replaced by .txt\n",
"    txt_filename = re.sub(r'\\.html$', '.txt', html_filename)\n",
"    txt_file_path = os.path.join(txt_folder_path, txt_filename)\n",
"    # Explicit encoding so that accented text is read the same way on every platform\n",
"    with open(txt_file_path, 'r', encoding='utf-8') as file:\n",
"        txt_file_contents = file.read()\n",
"\n",
"    # The first line of the text file is the URL of the page\n",
"    url = txt_file_contents.split('\\n')[0]\n",
"    if '?' in url:  # URLs with a '?' correspond to service calls and have no useful content\n",
"        continue\n",
"    if not url.startswith('https://www.caisse-epargne.fr/rhone-alpes/'):\n",
"        continue\n",
"\n",
"    prefix = 'https://www.caisse-epargne.fr/'\n",
"    suffix = url.replace(prefix, '')\n",
"    tags = suffix.split('/')\n",
"    tags = [tag for tag in tags if tag]  # remove empty parts\n",
"    with open(html_file_path, 'r', encoding='utf-8') as file:\n",
"        html_file_contents = file.read()\n",
"    soup = BeautifulSoup(html_file_contents, 'html.parser')\n",
"    # soup.find() returns None when the page has no <section>: check it before looking for the <h1>\n",
"    first_section = soup.find('section')\n",
"    page_title_present = first_section.find('h1') if first_section is not None else None\n",
"    if not page_title_present:\n",
"        continue\n",
"    page_title = page_title_present.get_text()\n",
"\n",
"    # Every <section> except those carrying the 'key-informations' class\n",
"    sections = soup.find_all(lambda tag: tag.name in ['section'] and 'key-informations' not in tag.get('class', []))\n",
"\n",
"    struct_page = {'title': page_title}\n",
"    current_section = ''\n",
"    for section in sections:\n",
"        for wysiwyg_tag in section.find_all(class_=\"wysiwyg\"):\n",
"            # Check for a title within the wysiwyg container\n",
"            internal_title = wysiwyg_tag.find(heading_tags) or wysiwyg_tag.find('p', class_='title')\n",
"\n",
"            # If there is no internal title, use the nearest title among the previous siblings\n",
"            if not internal_title:\n",
"                nearest_title = None\n",
"                for sibling in wysiwyg_tag.find_previous_siblings():\n",
"                    if sibling.name in heading_tags or (sibling.name == 'p' and 'title' in sibling.get('class', [])):\n",
"                        nearest_title = sibling.get_text().strip()\n",
"                        break\n",
"                if not nearest_title:\n",
"                    # No (non-empty) title found for this container: skip it\n",
"                    continue\n",
"                nearest_title = re.sub(r'\\(\\d\\)', '', nearest_title)\n",
"                nearest_title = re.sub(r'^\\d+\\.\\s*', '', nearest_title)\n",
"                current_section = nearest_title\n",
"                # NOTE(review): a repeated title restarts its paragraph list (earlier content is replaced)\n",
"                struct_page[current_section] = []\n",
"\n",
"            for child in wysiwyg_tag.find_all(['p', 'li'] + heading_tags):\n",
"                text = child.get_text().strip()\n",
"                text = re.sub(r'\\(\\d\\)', '', text)  # drop single-digit markers such as '(1)'\n",
"                if child.name in heading_tags or (child.name == 'p' and 'title' in child.get('class', [])):\n",
"                    text = re.sub(r'^\\d+\\.\\s*', '', text)  # drop a leading numbering such as '1. '\n",
"                    current_section = text\n",
"                    struct_page[current_section] = []\n",
"                elif 'is-style-mentions' not in child.get('class', []) and current_section in struct_page:\n",
"                    # <p> not of class title, or <li>; text met before any known section title is dropped\n",
"                    struct_page[current_section].append(text)\n",
"\n",
"    # detect_big_chunks(struct_page, html_filename)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e6da54d7-6c70-44eb-b08b-392c742d0525",
"metadata": {},
"outputs": [],
"source": [
"# Character lengths of the passages measured by detect_big_chunks()\n",
"chunks_length = []\n",
"\n",
"\n",
"def _split_paragraphs(paragraphs, title, subtitle, max_chunk_size):\n",
"    \"\"\"Greedily group consecutive paragraphs into passages below `max_chunk_size` tokens.\n",
"\n",
"    A paragraph is added to the current group as long as the resulting passage\n",
"    (as built by passage_str) stays strictly below the limit. A single paragraph\n",
"    that reaches the limit on its own is kept as a one-paragraph, oversized group.\n",
"    Returns the list of groups (each a list of paragraphs).\n",
"    \"\"\"\n",
"    groups = []\n",
"    current = []\n",
"    for paragraph in paragraphs:\n",
"        candidate = current + [paragraph]\n",
"        if current and token_length(passage_str(candidate, title, subtitle)) >= max_chunk_size:\n",
"            # The paragraph does not fit: close the current group and start a new one with it\n",
"            groups.append(current)\n",
"            current = [paragraph]\n",
"        else:\n",
"            current = candidate\n",
"    if current:\n",
"        groups.append(current)\n",
"    return groups\n",
"\n",
"\n",
"def detect_big_chunks(struct_page, filename, max_chunk_size=512):\n",
"    \"\"\"Record in `chunks_length` the character lengths of the passages of a page.\n",
"\n",
"    struct_page: dict with the page title under 'title' and, for every other key\n",
"        (a section title), the list of paragraph texts of that section.\n",
"    filename: not used by the function; kept so existing calls remain valid.\n",
"    max_chunk_size: token limit above which a section passage is split.\n",
"\n",
"    When a section passage is longer than `max_chunk_size` tokens, its paragraphs\n",
"    are regrouped into smaller passages and the length of each one is recorded.\n",
"    The limit is measured in tokens while the recorded lengths are in characters.\n",
"    Returns None.\n",
"    \"\"\"\n",
"    title = struct_page['title']\n",
"    for subtitle, paragraphs in struct_page.items():\n",
"        if subtitle == 'title':\n",
"            continue\n",
"        doc_str = passage_str(paragraphs, title, subtitle)\n",
"        if token_length(doc_str) > max_chunk_size:\n",
"            for group in _split_paragraphs(paragraphs, title, subtitle, max_chunk_size):\n",
"                chunks_length.append(len(passage_str(group, title, subtitle)))\n",
"        # NOTE(review): the length of the whole passage is recorded even when it was split above - confirm this is intended.\n",
"        chunks_length.append(len(doc_str))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "8a534ec5-a85a-41bf-b229-c896612cec42",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'title': 'Devenez sociétaire !',\n",
" 'Qui peut devenir sociétaire ?': ['Tous les clients de la Caisse dEpargne peuvent souscrire des parts sociales : particuliers, personnes morales (associations, entreprises), EPCI (Établissements Publics de Coopération Intercommunale) à fiscalité propre. Les collectivités territoriales peuvent également devenir sociétaires.'],\n",
" 'Comment devenir sociétaire\\xa0?': ['Vous souscrivez vos parts sociales de la Société Locale dEpargne (SLE) auprès de lagence où est domicilié votre compte principal. Pour tout renseignement, contactez votre conseiller, il saura vous orienter.'],\n",
" 'Le site sociétaires': ['Sur www.societaires.caisse-epargne.fr, vous disposez dun site dinformation et davantages sélectionnés pour vous. Vous y découvrirez les réalisations et engagements de votre Caisse dEpargne sur votre territoire : actualité, partenariats, soutien aux actions sociétales…',\n",
" 'Cest aussi une source incontournable dinformations sur lorganisation et les valeurs coopératives, les assemblées générales, la vie du sociétariat et des sociétés locales dépargne.']}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the most recently built `struct_page` (page title plus paragraphs per section title)\n",
"struct_page"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fdd70455-c279-4b08-87e9-d42c5c093bc6",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "RAG_ENV",
"language": "python",
"name": "rag_env"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}