rag/rag_fr_embedding_test.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d8acc709-ebb2-4fa6-982b-3d13fe8d2beb",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/peportier/miniforge3/envs/RAG_ENV/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "/Users/peportier/miniforge3/envs/RAG_ENV/lib/python3.9/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
      "  _torch_pytree._register_pytree_node(\n"
     ]
    }
   ],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import base64\n",
    "import re\n",
    "from transformers import AutoTokenizer\n",
    "import logging\n",
    "import os\n",
    "from IPython.display import Markdown, display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "54f5ab50-2ee3-45ad-9208-c1e2dc362152",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the tokenizer published under `model_name`; token_length() uses it to size passages.\n",
    "# NOTE(review): AutoTokenizer is already imported in the first cell, so this re-import is redundant but harmless.\n",
    "from transformers import AutoTokenizer\n",
    "model_name = 'intfloat/multilingual-e5-large'\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8c9e6f4e-609d-488d-a738-41934a62e92a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def token_length(text):\n",
    "    \"\"\"Count the tokens the module-level `tokenizer` produces for `text`.\n",
    "\n",
    "    Special tokens are excluded from the count (`add_special_tokens=False`),\n",
    "    so the number reflects the text alone.\n",
    "    \"\"\"\n",
    "    token_ids = tokenizer.encode(text, add_special_tokens=False)\n",
    "    return len(token_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8deef599-04b0-4d9f-9b3e-ac9ae5a472a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def passage_str(paragraphs, title, subtitle):\n",
    "    \"\"\"Assemble the text of one passage.\n",
    "\n",
    "    The result is the literal prefix 'passage: ' followed by `title`, a blank\n",
    "    line, `subtitle`, a blank line, and then `paragraphs` joined one per line.\n",
    "    \"\"\"\n",
    "    header = f\"passage: {title}\\n\\n{subtitle}\\n\\n\"\n",
    "    body = '\\n'.join(paragraphs)\n",
    "    return header + body"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1ef97436-37c2-45b4-8e00-7737d87c261e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Folder containing the HTML pages that the parsing loop opens with BeautifulSoup.\n",
    "html_folder_path = '../scrapcera/htmls/'\n",
    "# Folder containing the .txt file of each page (same base name); the parsing loop reads the page URL from its first line.\n",
    "txt_folder_path = '../scrapcera/docs/'\n",
    "# NOTE(review): the parsing loop reuses `html_filename` as its loop variable, so this value is overwritten there.\n",
    "html_filename = '97e88fd1d6.html'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "560280af-ad79-43e8-b4df-c4f69aa40dcf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse each selected page into `struct_page`: {'title': page title, section title: [paragraph texts]}.\n",
    "# `struct_page` is rebuilt on every iteration, so after the loop it describes the last page that was kept.\n",
    "for html_filename in ['f6d921ced8.html']: # os.listdir(html_folder_path):\n",
    "    \n",
    "    html_file_path = os.path.join(html_folder_path, html_filename)\n",
    "    # Anchored on the end of the name so only the trailing extension is swapped.\n",
    "    txt_filename = re.sub(r'\\.html$', '.txt', html_filename)\n",
    "    txt_file_path = os.path.join(txt_folder_path, txt_filename)\n",
    "    # Explicit encoding so accented text does not depend on the platform default\n",
    "    # (assumes the scraped files are UTF-8 -- TODO confirm).\n",
    "    with open(txt_file_path, 'r', encoding='utf-8') as file:\n",
    "        txt_file_contents = file.read()\n",
    "    \n",
    "    # The first line of the .txt file is the URL of the page.\n",
    "    url = txt_file_contents.split('\\n')[0]\n",
    "    if '?' in url: # URLs with a '?' correspond to service calls and have no useful content\n",
    "        continue\n",
    "    if not url.startswith('https://www.caisse-epargne.fr/rhone-alpes/'):\n",
    "        continue\n",
    "    \n",
    "    # Path components of the URL, e.g. 'rhone-alpes/a/b/' -> ['rhone-alpes', 'a', 'b'].\n",
    "    prefix = 'https://www.caisse-epargne.fr/'\n",
    "    suffix = url.replace(prefix, '')\n",
    "    tags = suffix.split('/')\n",
    "    tags = [tag for tag in tags if tag] # remove empty parts\n",
    "    with open(html_file_path, 'r', encoding='utf-8') as file:\n",
    "        html_file_contents = file.read()\n",
    "    soup = BeautifulSoup(html_file_contents, 'html.parser')\n",
    "    # The page title is the <h1> of the first <section>; find() returns None when\n",
    "    # there is no <section>, so guard before chaining instead of raising AttributeError.\n",
    "    first_section = soup.find('section')\n",
    "    page_title_present = first_section.find('h1') if first_section is not None else None\n",
    "    if not page_title_present:\n",
    "        continue\n",
    "    page_title = page_title_present.get_text()\n",
    "    \n",
    "    # Every <section> except those carrying the 'key-informations' class.\n",
    "    sections = soup.find_all(lambda tag: tag.name in ['section'] and 'key-informations' not in tag.get('class', []))\n",
    "    \n",
    "    struct_page = {'title': page_title}\n",
    "    current_section = ''\n",
    "    for section in sections:\n",
    "        for wysiwyg_tag in section.find_all(class_=\"wysiwyg\"):\n",
    "            # Check for a title within the wysiwyg container\n",
    "            internal_title = wysiwyg_tag.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) or wysiwyg_tag.find('p', class_='title')\n",
    "    \n",
    "            # If no internal title, find the nearest title from previous siblings\n",
    "            if not internal_title:\n",
    "                nearest_title = None\n",
    "                for sibling in wysiwyg_tag.find_previous_siblings():\n",
    "                    if sibling.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:\n",
    "                        nearest_title = sibling.get_text().strip()\n",
    "                        break\n",
    "                    if sibling.name == 'p' and 'title' in sibling.get('class', []):\n",
    "                        nearest_title = sibling.get_text().strip()\n",
    "                        break\n",
    "                if nearest_title:\n",
    "                    # Drop single-digit markers such as '(1)' and a leading 'N. ' numbering.\n",
    "                    nearest_title = re.sub(r'\\(\\d\\)', '', nearest_title)\n",
    "                    nearest_title = re.sub(r'^\\d+\\.\\s*', '', nearest_title)\n",
    "                    current_section = nearest_title\n",
    "                    struct_page[current_section] = []\n",
    "                else:\n",
    "                    # No title inside or before this container: its content is skipped.\n",
    "                    continue\n",
    "            for child in wysiwyg_tag.find_all(['p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):\n",
    "                text = child.get_text().strip()\n",
    "                text = re.sub(r'\\(\\d\\)', '', text)\n",
    "                if child.name.startswith('h') or (child.name == 'p' and 'title' in child.get('class', [])):\n",
    "                    # A heading (or <p class='title'>) opens a new section; a repeated title resets its list.\n",
    "                    text = re.sub(r'^\\d+\\.\\s*', '', text)\n",
    "                    current_section = text\n",
    "                    struct_page[current_section] = []\n",
    "                else: # <p> not of class title, or <li>\n",
    "                    # Elements carrying the 'is-style-mentions' class are left out.\n",
    "                    if 'is-style-mentions' not in child.get('class', []):\n",
    "                        if current_section in struct_page:\n",
    "                            struct_page[current_section].append(text)\n",
    "\n",
    "    # detect_big_chunks(struct_page, html_filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "e6da54d7-6c70-44eb-b08b-392c742d0525",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Character lengths recorded by detect_big_chunks(); filled as a side effect of each call.\n",
    "chunks_length = []\n",
    "def detect_big_chunks(struct_page, filename):\n",
    "    \"\"\"Split over-long sections of a page into sub-chunks and record their sizes.\n",
    "\n",
    "    For every section whose passage text exceeds `max_chunk_size` tokens, the\n",
    "    paragraphs are packed greedily, in order, into consecutive sub-chunks. The\n",
    "    character length of each sub-chunk is appended to the module-level\n",
    "    `chunks_length` list, followed by the character length of the unsplit\n",
    "    passage. Sections that fit within the limit are not recorded.\n",
    "\n",
    "    Args:\n",
    "        struct_page: dict with a 'title' entry plus one entry per section,\n",
    "            mapping the section title to its list of paragraph strings.\n",
    "        filename: name of the source file; currently unused.\n",
    "\n",
    "    Returns:\n",
    "        None. Results are appended to `chunks_length`.\n",
    "    \"\"\"\n",
    "    # presumably the maximum input length of the embedding model -- TODO confirm\n",
    "    max_chunk_size = 512\n",
    "    title = struct_page['title']\n",
    "    for subtitle, paragraphs in struct_page.items():\n",
    "        if subtitle == 'title':\n",
    "            continue\n",
    "        doc_str = passage_str(paragraphs, title, subtitle)\n",
    "        if token_length(doc_str) <= max_chunk_size:\n",
    "            continue\n",
    "        sub_paragraphs = []\n",
    "        for paragraph in paragraphs:\n",
    "            candidate_str = passage_str(sub_paragraphs + [paragraph], title, subtitle)\n",
    "            if sub_paragraphs and token_length(candidate_str) >= max_chunk_size:\n",
    "                # Adding this paragraph would reach the limit: close the current\n",
    "                # sub-chunk and start the next one with this paragraph.\n",
    "                chunks_length.append(len(passage_str(sub_paragraphs, title, subtitle)))\n",
    "                sub_paragraphs = [paragraph]\n",
    "            else:\n",
    "                # A paragraph that reaches the limit on its own still becomes a\n",
    "                # sub-chunk by itself, so the scan always advances (the previous\n",
    "                # index-rewinding version looped forever on such a paragraph and\n",
    "                # recorded header-only chunks).\n",
    "                sub_paragraphs.append(paragraph)\n",
    "        # Record whatever is left after the last paragraph.\n",
    "        chunks_length.append(len(passage_str(sub_paragraphs, title, subtitle)))\n",
    "        # NOTE(review): the unsplit passage is recorded in addition to its\n",
    "        # sub-chunks, as before -- confirm this double entry is intended.\n",
    "        chunks_length.append(len(doc_str))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "8a534ec5-a85a-41bf-b229-c896612cec42",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'title': 'Devenez sociétaire !',\n",
       " 'Qui peut devenir sociétaire ?': ['Tous les clients de la Caisse d’Epargne peuvent souscrire des parts sociales : particuliers, personnes morales (associations, entreprises), EPCI (Établissements Publics de Coopération Intercommunale) à fiscalité propre. Les collectivités territoriales peuvent également devenir sociétaires.'],\n",
       " 'Comment devenir sociétaire\\xa0?': ['Vous souscrivez vos parts sociales de la Société Locale d’Epargne (SLE) auprès de l’agence où est domicilié votre compte principal. Pour tout renseignement, contactez votre conseiller, il saura vous orienter.'],\n",
       " 'Le site sociétaires': ['Sur www.societaires.caisse-epargne.fr, vous disposez d’un site d’information et d’avantages sélectionnés pour vous. Vous y découvrirez les réalisations et engagements de votre Caisse d’Epargne sur votre territoire : actualité, partenariats, soutien aux actions sociétales…',\n",
       "  'C’est aussi une source incontournable d’informations sur l’organisation et les valeurs coopératives, les assemblées générales, la vie du sociétariat et des sociétés locales d’épargne.']}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "struct_page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fdd70455-c279-4b08-87e9-d42c5c093bc6",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "RAG_ENV",
   "language": "python",
   "name": "rag_env"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}