rag/rag_fr_embedding_test.ipynb

253 lines
11 KiB
Plaintext
Raw Normal View History

2024-01-10 14:06:42 +00:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d8acc709-ebb2-4fa6-982b-3d13fe8d2beb",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/peportier/miniforge3/envs/RAG_ENV/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/Users/peportier/miniforge3/envs/RAG_ENV/lib/python3.9/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
" _torch_pytree._register_pytree_node(\n"
]
}
],
"source": [
"from bs4 import BeautifulSoup\n",
"import base64\n",
"import re\n",
"from transformers import AutoTokenizer\n",
"import logging\n",
"import os\n",
"from IPython.display import Markdown, display"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "54f5ab50-2ee3-45ad-9208-c1e2dc362152",
"metadata": {},
"outputs": [],
"source": [
"# AutoTokenizer is provided by the imports cell at the top of the notebook;\n",
"# the duplicate import that used to live here was redundant.\n",
"# Checkpoint name handed to AutoTokenizer.from_pretrained; the resulting\n",
"# tokenizer is the one used to count tokens in passages.\n",
"model_name = 'intfloat/multilingual-e5-large'\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8c9e6f4e-609d-488d-a738-41934a62e92a",
"metadata": {},
"outputs": [],
"source": [
"def token_length(text):\n",
"    \"\"\"Return how many tokens `tokenizer` produces for `text`, special tokens excluded.\"\"\"\n",
"    token_ids = tokenizer.encode(text, add_special_tokens=False)\n",
"    return len(token_ids)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8deef599-04b0-4d9f-9b3e-ac9ae5a472a0",
"metadata": {},
"outputs": [],
"source": [
"def passage_str(paragraphs, title, subtitle):\n",
"    \"\"\"Assemble the text of one passage.\n",
"\n",
"    The result starts with the literal prefix 'passage: ' followed by `title`,\n",
"    a blank line, `subtitle`, a blank line, then `paragraphs` joined one per line.\n",
"    \"\"\"\n",
"    header = f\"passage: {title}\\n\\n{subtitle}\\n\\n\"\n",
"    body = '\\n'.join(paragraphs)\n",
"    return header + body"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1ef97436-37c2-45b4-8e00-7737d87c261e",
"metadata": {},
"outputs": [],
"source": [
"# Folder from which the HTML pages are read.\n",
"html_folder_path = '../scrapcera/htmls/'\n",
"# Folder holding the companion .txt file of each page (same name, .txt extension).\n",
"txt_folder_path = '../scrapcera/docs/'\n",
"# NOTE(review): the page-processing loop reuses `html_filename` as its loop\n",
"# variable, so this value is overwritten as soon as that loop runs.\n",
"html_filename = '97e88fd1d6.html'"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "560280af-ad79-43e8-b4df-c4f69aa40dcf",
"metadata": {},
"outputs": [],
"source": [
"# Tags treated as headings, both inside a wysiwyg container and among its siblings.\n",
"heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']\n",
"\n",
"for html_filename in ['f6d921ced8.html']: # os.listdir(html_folder_path):\n",
"\n",
"    # Skip entries that are not '.html' pages (a directory listing may return other files).\n",
"    filename_stem, filename_extension = os.path.splitext(html_filename)\n",
"    if filename_extension != '.html':\n",
"        continue\n",
"    html_file_path = os.path.join(html_folder_path, html_filename)\n",
"    txt_filename = filename_stem + '.txt'\n",
"    txt_file_path = os.path.join(txt_folder_path, txt_filename)\n",
"    # Explicit encoding so the text decodes the same way on every platform\n",
"    # (assumes the scraped files were saved as UTF-8 -- TODO confirm).\n",
"    with open(txt_file_path, 'r', encoding='utf-8') as file:\n",
"        txt_file_contents = file.read()\n",
"\n",
"    # The first line of the companion .txt file is read as the page URL.\n",
"    url = txt_file_contents.split('\\n')[0]\n",
"    if '?' in url: # URLs containing '?' correspond to service calls and have no useful content\n",
"        continue\n",
"    if not url.startswith('https://www.caisse-epargne.fr/rhone-alpes/'):\n",
"        continue\n",
"\n",
"    # Non-empty path segments of the URL, relative to the site root.\n",
"    prefix = 'https://www.caisse-epargne.fr/'\n",
"    suffix = url.replace(prefix, '')\n",
"    tags = suffix.split('/')\n",
"    tags = [tag for tag in tags if tag] # drop empty path segments\n",
"    with open(html_file_path, 'r', encoding='utf-8') as file:\n",
"        html_file_contents = file.read()\n",
"    soup = BeautifulSoup(html_file_contents, 'html.parser')\n",
"    # soup.find() returns None when the page has no <section>; guard before\n",
"    # chaining .find('h1') so such pages are skipped instead of raising.\n",
"    first_section = soup.find('section')\n",
"    page_title_present = first_section.find('h1') if first_section is not None else None\n",
"    if not page_title_present:\n",
"        continue\n",
"    page_title = page_title_present.get_text()\n",
"\n",
"    # Every <section> except those carrying the 'key-informations' class.\n",
"    sections = soup.find_all(lambda tag: tag.name in ['section'] and 'key-informations' not in tag.get('class', []))\n",
"\n",
"    # 'title' holds the page title; every other key is a section heading\n",
"    # mapped to the list of paragraph texts collected under it.\n",
"    struct_page = {'title': page_title}\n",
"    current_section = ''\n",
"    for section in sections:\n",
"        for wysiwyg_tag in section.find_all(class_=\"wysiwyg\"):\n",
"            # Check for a title within the wysiwyg container\n",
"            internal_title = wysiwyg_tag.find(heading_tags) or wysiwyg_tag.find('p', class_='title')\n",
"\n",
"            # If no internal title, fall back to the nearest title among the previous siblings\n",
"            if not internal_title:\n",
"                nearest_title = None\n",
"                for sibling in wysiwyg_tag.find_previous_siblings():\n",
"                    is_heading = sibling.name in heading_tags\n",
"                    is_title_paragraph = sibling.name == 'p' and 'title' in sibling.get('class', [])\n",
"                    if is_heading or is_title_paragraph:\n",
"                        nearest_title = sibling.get_text().strip()\n",
"                        break\n",
"                if nearest_title:\n",
"                    # Strip single-digit markers such as '(1)' and a leading 'N. ' numbering\n",
"                    nearest_title = re.sub(r'\\(\\d\\)', '', nearest_title)\n",
"                    nearest_title = re.sub(r'^\\d+\\.\\s*', '', nearest_title)\n",
"                    current_section = nearest_title\n",
"                    struct_page[current_section] = []\n",
"                else:\n",
"                    # No usable title for this container: skip it entirely\n",
"                    continue\n",
"            for child in wysiwyg_tag.find_all(['p', 'li'] + heading_tags):\n",
"                text = child.get_text().strip()\n",
"                text = re.sub(r'\\(\\d\\)', '', text)\n",
"                if child.name.startswith('h') or (child.name == 'p' and 'title' in child.get('class', [])):\n",
"                    # A heading (or <p class='title'>) opens a new section\n",
"                    text = re.sub(r'^\\d+\\.\\s*', '', text)\n",
"                    current_section = text\n",
"                    struct_page[current_section] = []\n",
"                else: # <p> not of class title, or <li>\n",
"                    if 'is-style-mentions' not in child.get('class', []):\n",
"                        if current_section in struct_page:\n",
"                            struct_page[current_section].append(text)\n",
"\n",
"    # detect_big_chunks(struct_page, html_filename)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e6da54d7-6c70-44eb-b08b-392c742d0525",
"metadata": {},
"outputs": [],
"source": [
"# Character lengths of the passages seen so far; filled by detect_big_chunks.\n",
"chunks_length = []\n",
"\n",
"\n",
"def detect_big_chunks(struct_page, filename, max_chunk_size=512):\n",
"    \"\"\"Record in `chunks_length` the character lengths of the passages of a page.\n",
"\n",
"    For every key of `struct_page` other than 'title', the passage text is built\n",
"    with `passage_str` and measured with `token_length`. When it exceeds\n",
"    `max_chunk_size` tokens, its paragraphs are packed greedily into\n",
"    sub-passages: a sub-passage is closed as soon as adding the next paragraph\n",
"    would bring it to `max_chunk_size` tokens or more. A sub-passage always\n",
"    holds at least one paragraph, so a single oversized paragraph becomes its\n",
"    own sub-passage instead of stalling the split.\n",
"\n",
"    Args:\n",
"        struct_page: mapping with a 'title' entry (the page title); every other\n",
"            key is a subtitle mapped to the list of its paragraph strings.\n",
"        filename: not used by the function; kept so existing calls still work.\n",
"        max_chunk_size: token budget for one passage.\n",
"\n",
"    Returns:\n",
"        None. Lengths (in characters, via `len`) are appended to the\n",
"        module-level `chunks_length` list.\n",
"    \"\"\"\n",
"    title = struct_page['title']\n",
"    for subtitle, paragraphs in struct_page.items():\n",
"        if subtitle == 'title':\n",
"            continue\n",
"        doc_str = passage_str(paragraphs, title, subtitle)\n",
"        if token_length(doc_str) > max_chunk_size:\n",
"            sub_paragraphs = []\n",
"            for paragraph in paragraphs:\n",
"                candidate = sub_paragraphs + [paragraph]\n",
"                # Only close the current sub-passage when it already holds something;\n",
"                # otherwise an oversized paragraph could never be placed.\n",
"                if sub_paragraphs and token_length(passage_str(candidate, title, subtitle)) >= max_chunk_size:\n",
"                    chunks_length.append(len(passage_str(sub_paragraphs, title, subtitle)))\n",
"                    sub_paragraphs = [paragraph]\n",
"                else:\n",
"                    sub_paragraphs = candidate\n",
"            if sub_paragraphs:\n",
"                chunks_length.append(len(passage_str(sub_paragraphs, title, subtitle)))\n",
"        # NOTE(review): the length of the whole section is recorded even when the\n",
"        # section was split above, as in the previous version -- confirm this is\n",
"        # intended and not a missing `else`.\n",
"        chunks_length.append(len(doc_str))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "8a534ec5-a85a-41bf-b229-c896612cec42",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'title': 'Devenez sociétaire !',\n",
" 'Qui peut devenir sociétaire ?': ['Tous les clients de la Caisse dEpargne peuvent souscrire des parts sociales : particuliers, personnes morales (associations, entreprises), EPCI (Établissements Publics de Coopération Intercommunale) à fiscalité propre. Les collectivités territoriales peuvent également devenir sociétaires.'],\n",
" 'Comment devenir sociétaire\\xa0?': ['Vous souscrivez vos parts sociales de la Société Locale dEpargne (SLE) auprès de lagence où est domicilié votre compte principal. Pour tout renseignement, contactez votre conseiller, il saura vous orienter.'],\n",
" 'Le site sociétaires': ['Sur www.societaires.caisse-epargne.fr, vous disposez dun site dinformation et davantages sélectionnés pour vous. Vous y découvrirez les réalisations et engagements de votre Caisse dEpargne sur votre territoire : actualité, partenariats, soutien aux actions sociétales…',\n",
" 'Cest aussi une source incontournable dinformations sur lorganisation et les valeurs coopératives, les assemblées générales, la vie du sociétariat et des sociétés locales dépargne.']}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"struct_page"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fdd70455-c279-4b08-87e9-d42c5c093bc6",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "RAG_ENV",
"language": "python",
"name": "rag_env"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}