253 lines
11 KiB
Plaintext
253 lines
11 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "d8acc709-ebb2-4fa6-982b-3d13fe8d2beb",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/Users/peportier/miniforge3/envs/RAG_ENV/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||
" from .autonotebook import tqdm as notebook_tqdm\n",
|
||
"/Users/peportier/miniforge3/envs/RAG_ENV/lib/python3.9/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
|
||
" _torch_pytree._register_pytree_node(\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from bs4 import BeautifulSoup\n",
|
||
"import base64\n",
|
||
"import re\n",
|
||
"from transformers import AutoTokenizer\n",
|
||
"import logging\n",
|
||
"import os\n",
|
||
"from IPython.display import Markdown, display"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "54f5ab50-2ee3-45ad-9208-c1e2dc362152",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"from transformers import AutoTokenizer\n",
"\n",
"# Hugging Face model id whose tokenizer is used to measure text length in tokens.\n",
"model_name = 'intfloat/multilingual-e5-large'\n",
"tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "8c9e6f4e-609d-488d-a738-41934a62e92a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"def token_length(text):\n",
"    \"\"\"Return how many tokens `text` encodes to, special tokens excluded.\n",
"\n",
"    Uses the notebook-level `tokenizer`.\n",
"    \"\"\"\n",
"    token_ids = tokenizer.encode(text, add_special_tokens=False)\n",
"    return len(token_ids)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "8deef599-04b0-4d9f-9b3e-ac9ae5a472a0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"def passage_str(paragraphs, title, subtitle):\n",
"    \"\"\"Build the 'passage: ' text for one section of a page.\n",
"\n",
"    The result is the literal prefix 'passage: ' followed by `title`, a blank\n",
"    line, `subtitle`, a blank line, and then the `paragraphs` joined with\n",
"    single newlines.\n",
"    \"\"\"\n",
"    header = f\"passage: {title}\\n\\n{subtitle}\\n\\n\"\n",
"    body = '\\n'.join(paragraphs)\n",
"    return header + body"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "1ef97436-37c2-45b4-8e00-7737d87c261e",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Input folders, relative to the notebook: the HTML pages and the .txt files\n",
"# that share their base names.\n",
"html_folder_path, txt_folder_path = '../scrapcera/htmls/', '../scrapcera/docs/'\n",
"# NOTE(review): the parsing loop reuses `html_filename` as its loop variable,\n",
"# so this value is overwritten as soon as that cell runs.\n",
"html_filename = '97e88fd1d6.html'"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "560280af-ad79-43e8-b4df-c4f69aa40dcf",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Tag names treated as section headings when structuring a page.\n",
"HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']\n",
"\n",
"# For each page, build `struct_page`: {'title': <page h1>, <section title>: [paragraph, ...]}.\n",
"for html_filename in ['f6d921ced8.html']:  # os.listdir(html_folder_path):\n",
"\n",
"    html_file_path = os.path.join(html_folder_path, html_filename)\n",
"    # Anchored so that only the trailing extension is rewritten.\n",
"    txt_filename = re.sub(r'\\.html$', '.txt', html_filename)\n",
"    txt_file_path = os.path.join(txt_folder_path, txt_filename)\n",
"    # Explicit encoding: the pages hold accented French text and the platform\n",
"    # default encoding is not guaranteed to be UTF-8.\n",
"    with open(txt_file_path, 'r', encoding='utf-8') as file:\n",
"        txt_file_contents = file.read()\n",
"\n",
"    # The first line of the .txt file is the page URL.\n",
"    url = txt_file_contents.split('\\n')[0]\n",
"    if '?' in url:  # URLs with a '?' correspond to service calls and have no useful content\n",
"        continue\n",
"    if not url.startswith('https://www.caisse-epargne.fr/rhone-alpes/'):\n",
"        continue\n",
"\n",
"    # Split the URL path into its non-empty components.\n",
"    prefix = 'https://www.caisse-epargne.fr/'\n",
"    suffix = url.replace(prefix, '')\n",
"    tags = suffix.split('/')\n",
"    tags = [tag for tag in tags if tag]  # remove empty parts\n",
"    with open(html_file_path, 'r', encoding='utf-8') as file:\n",
"        html_file_contents = file.read()\n",
"    soup = BeautifulSoup(html_file_contents, 'html.parser')\n",
"    # soup.find() returns None when the page has no <section>; skip such a page\n",
"    # instead of failing with AttributeError on the chained .find('h1').\n",
"    first_section = soup.find('section')\n",
"    page_title_present = first_section.find('h1') if first_section else None\n",
"    if not page_title_present:\n",
"        continue\n",
"    page_title = page_title_present.get_text()\n",
"\n",
"    # Every <section> except those carrying the 'key-informations' class.\n",
"    sections = soup.find_all(\n",
"        lambda tag: tag.name == 'section' and 'key-informations' not in tag.get('class', [])\n",
"    )\n",
"\n",
"    struct_page = {'title': page_title}\n",
"    current_section = ''\n",
"    for section in sections:\n",
"        for wysiwyg_tag in section.find_all(class_='wysiwyg'):\n",
"            # Check for a title within the wysiwyg container\n",
"            internal_title = wysiwyg_tag.find(HEADING_TAGS) or wysiwyg_tag.find('p', class_='title')\n",
"\n",
"            # If no internal title, find the nearest title from previous siblings\n",
"            if not internal_title:\n",
"                nearest_title = None\n",
"                for sibling in wysiwyg_tag.find_previous_siblings():\n",
"                    is_heading = sibling.name in HEADING_TAGS\n",
"                    is_title_paragraph = sibling.name == 'p' and 'title' in sibling.get('class', [])\n",
"                    if is_heading or is_title_paragraph:\n",
"                        nearest_title = sibling.get_text().strip()\n",
"                        break\n",
"                if not nearest_title:\n",
"                    # No title anywhere for this container: its content is skipped.\n",
"                    continue\n",
"                # Strip '(n)' markers and a leading 'n. ' numbering from the title.\n",
"                nearest_title = re.sub(r'\\(\\d\\)', '', nearest_title)\n",
"                nearest_title = re.sub(r'^\\d+\\.\\s*', '', nearest_title)\n",
"                current_section = nearest_title\n",
"                struct_page[current_section] = []\n",
"\n",
"            for child in wysiwyg_tag.find_all(['p', 'li'] + HEADING_TAGS):\n",
"                text = child.get_text().strip()\n",
"                text = re.sub(r'\\(\\d\\)', '', text)\n",
"                if child.name.startswith('h') or (child.name == 'p' and 'title' in child.get('class', [])):\n",
"                    # A heading (or <p class=\"title\">) opens a new section.\n",
"                    text = re.sub(r'^\\d+\\.\\s*', '', text)\n",
"                    current_section = text\n",
"                    struct_page[current_section] = []\n",
"                else:  # <p> not of class title, or <li>\n",
"                    if 'is-style-mentions' not in child.get('class', []):\n",
"                        if current_section in struct_page:\n",
"                            struct_page[current_section].append(text)\n",
"\n",
"    # detect_big_chunks(struct_page, html_filename)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "e6da54d7-6c70-44eb-b08b-392c742d0525",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Character lengths of every passage measured by detect_big_chunks.\n",
"chunks_length = []\n",
"\n",
"\n",
"def detect_big_chunks(struct_page, filename, max_chunk_size=512):\n",
"    \"\"\"Record passage lengths for one structured page into `chunks_length`.\n",
"\n",
"    For every section of `struct_page` (every key except 'title'), the full\n",
"    passage is built with `passage_str`. When it is longer than\n",
"    `max_chunk_size` tokens, its paragraphs are greedily regrouped into\n",
"    sub-passages: paragraphs are accumulated until adding one makes the\n",
"    sub-passage reach `max_chunk_size` tokens, at which point the accumulated\n",
"    group is recorded and the overflowing paragraph starts the next group.\n",
"    A paragraph that reaches the limit on its own is recorded as its own\n",
"    (oversized) sub-passage; the previous implementation looped forever on\n",
"    such a paragraph unless it was the last one.\n",
"\n",
"    Recorded values are character lengths (`len` of the passage string),\n",
"    while the size limit is expressed in tokens.\n",
"\n",
"    Args:\n",
"        struct_page: dict with a 'title' entry and, for each section title,\n",
"            a list of paragraph strings.\n",
"        filename: not used by the function; kept for existing callers.\n",
"        max_chunk_size: token limit per passage (defaults to 512).\n",
"    \"\"\"\n",
"    title = struct_page['title']\n",
"    for subtitle, paragraphs in struct_page.items():\n",
"        if subtitle == 'title':\n",
"            continue\n",
"        doc_str = passage_str(paragraphs, title, subtitle)\n",
"        if token_length(doc_str) > max_chunk_size:\n",
"            sub_paragraphs = []\n",
"            for paragraph in paragraphs:\n",
"                candidate = sub_paragraphs + [paragraph]\n",
"                # Only flush a non-empty group: a lone oversized paragraph is\n",
"                # kept as its own group so the scan always moves forward.\n",
"                if sub_paragraphs and token_length(passage_str(candidate, title, subtitle)) >= max_chunk_size:\n",
"                    chunks_length.append(len(passage_str(sub_paragraphs, title, subtitle)))\n",
"                    sub_paragraphs = [paragraph]\n",
"                else:\n",
"                    sub_paragraphs = candidate\n",
"            # Record the last (possibly only) group.\n",
"            chunks_length.append(len(passage_str(sub_paragraphs, title, subtitle)))\n",
"\n",
"        # NOTE(review): the full passage length is recorded for every section,\n",
"        # including the ones split above - confirm this is intended rather\n",
"        # than an `else` branch of the size check.\n",
"        chunks_length.append(len(doc_str))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "8a534ec5-a85a-41bf-b229-c896612cec42",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"{'title': 'Devenez sociétaire !',\n",
|
||
" 'Qui peut devenir sociétaire ?': ['Tous les clients de la Caisse d’Epargne peuvent souscrire des parts sociales : particuliers, personnes morales (associations, entreprises), EPCI (Établissements Publics de Coopération Intercommunale) à fiscalité propre. Les collectivités territoriales peuvent également devenir sociétaires.'],\n",
|
||
" 'Comment devenir sociétaire\\xa0?': ['Vous souscrivez vos parts sociales de la Société Locale d’Epargne (SLE) auprès de l’agence où est domicilié votre compte principal. Pour tout renseignement, contactez votre conseiller, il saura vous orienter.'],\n",
|
||
" 'Le site sociétaires': ['Sur www.societaires.caisse-epargne.fr, vous disposez d’un site d’information et d’avantages sélectionnés pour vous. Vous y découvrirez les réalisations et engagements de votre Caisse d’Epargne sur votre territoire : actualité, partenariats, soutien aux actions sociétales…',\n",
|
||
" 'C’est aussi une source incontournable d’informations sur l’organisation et les valeurs coopératives, les assemblées générales, la vie du sociétariat et des sociétés locales d’épargne.']}"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"struct_page"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "fdd70455-c279-4b08-87e9-d42c5c093bc6",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "RAG_ENV",
|
||
"language": "python",
|
||
"name": "rag_env"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.9.18"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|