In [1]:
from bs4 import BeautifulSoup
import base64
import re
from transformers import AutoTokenizer
import logging
import os
from IPython.display import Markdown, display

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


In [2]:
# Load the tokenizer for the model named below; `token_length` uses it to count tokens.
# NOTE(review): AutoTokenizer is already imported in the first cell, so this import is redundant.
from transformers import AutoTokenizer
model_name = 'intfloat/multilingual-e5-large'
# from_pretrained resolves the tokenizer by model name (presumably downloading or
# reading a local cache on first use — confirm for offline runs).
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
def token_length(text):
    """Return the number of tokens the module-level `tokenizer` produces for `text`.

    Special tokens are excluded from the count (`add_special_tokens=False`).
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return len(token_ids)

In [4]:
def passage_str(paragraphs, title, subtitle):
    """Format one section as a single "passage: ..." string.

    The result is the literal prefix "passage: ", the page title, a blank line,
    the section subtitle, a blank line, then the paragraphs joined by newlines.
    """
    header = f"passage: {title}\n\n{subtitle}\n\n"
    body = '\n'.join(paragraphs)
    return header + body

In [5]:
# Folder containing the scraped HTML pages.
html_folder_path = '../scrapcera/htmls/'
# Folder containing the companion .txt file of each page (same base name as the HTML file).
txt_folder_path = '../scrapcera/docs/'
# Sample file name; NOTE: the processing loop rebinds `html_filename` as its loop variable.
html_filename = '97e88fd1d6.html'

In [8]:
# Parse each scraped page into `struct_page`: a dict holding the page title under
# 'title' plus one entry per section heading, mapping to the list of paragraph /
# list-item texts found under that heading. `struct_page` is rebuilt for every
# page, so after the loop it describes the last page that was fully processed.
heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
for html_filename in ['f6d921ced8.html']: # os.listdir(html_folder_path):
    
    html_file_path = os.path.join(html_folder_path, html_filename)
    txt_filename = re.sub(r'\.html', '.txt', html_filename)
    txt_file_path = os.path.join(txt_folder_path, txt_filename)
    with open(txt_file_path, 'r') as file:
        txt_file_contents = file.read()
    
    # The first line of the companion .txt file is read as the page URL.
    url = txt_file_contents.split('\n')[0]
    if '?' in url: # URLs with a '?' correspond to service calls and have no useful content
        continue
    if not url.startswith('https://www.caisse-epargne.fr/rhone-alpes/'):
        continue
    
    # Split the URL path (after the site prefix) into its non-empty components.
    prefix = 'https://www.caisse-epargne.fr/'
    suffix = url.replace(prefix, '')
    tags = suffix.split('/')
    tags = [tag for tag in tags if tag] # remove empty parts
    with open(html_file_path, 'r') as file:
        html_file_contents = file.read()
    soup = BeautifulSoup(html_file_contents, 'html.parser')
    # The page title is the <h1> of the first <section>. `find` returns None when
    # nothing matches, so a page without any <section> is skipped instead of
    # raising AttributeError on `None.find`.
    first_section = soup.find('section')
    page_title_present = first_section.find('h1') if first_section is not None else None
    if not page_title_present:
        continue
    page_title = page_title_present.get_text()
    
    # Every <section> except those carrying the 'key-informations' class.
    sections = soup.find_all(lambda tag: tag.name in ['section'] and 'key-informations' not in tag.get('class', []))
    
    struct_page = {'title': page_title}
    current_section = ''
    for section in sections:
        for wysiwyg_tag in section.find_all(class_="wysiwyg"):
            # Check for a title within the wysiwyg container
            internal_title = wysiwyg_tag.find(heading_tags) or wysiwyg_tag.find('p', class_='title')
    
            # If no internal title, find the nearest title from previous siblings
            if not internal_title:
                nearest_title = None
                for sibling in wysiwyg_tag.find_previous_siblings():
                    if sibling.name in heading_tags:
                        nearest_title = sibling.get_text().strip()
                        break
                    if sibling.name == 'p' and 'title' in sibling.get('class', []):
                        nearest_title = sibling.get_text().strip()
                        break
                if nearest_title:
                    # Strip single-digit "(n)" markers and a leading "n. " numbering.
                    nearest_title = re.sub(r'\(\d\)', '', nearest_title)
                    nearest_title = re.sub(r'^\d+\.\s*', '', nearest_title)
                    current_section = nearest_title
                    # Starts (or restarts) the paragraph list of this section.
                    struct_page[current_section] = []
                else:
                    # No title inside or before this container: skip its content.
                    continue
            for child in wysiwyg_tag.find_all(['p', 'li'] + heading_tags):
                text = child.get_text().strip()
                text = re.sub(r'\(\d\)', '', text)
                if child.name.startswith('h') or (child.name == 'p' and 'title' in child.get('class', [])):
                    # A heading (or <p class="title">) opens a new section.
                    text = re.sub(r'^\d+\.\s*', '', text)
                    current_section = text
                    struct_page[current_section] = []
                else: # <p> not of class title, or <li>
                    # Paragraphs carrying the 'is-style-mentions' class are left out.
                    if 'is-style-mentions' not in child.get('class', []):
                        # Text seen before any section was opened is dropped.
                        if current_section in struct_page:
                            struct_page[current_section].append(text)

    # detect_big_chunks(struct_page, html_filename)

In [7]:
chunks_length = []
def detect_big_chunks(struct_page, filename, max_chunk_size=512):
    """Split over-long sections into paragraph groups and record their lengths.

    For every section of `struct_page` whose formatted passage (see `passage_str`)
    is longer than `max_chunk_size` tokens (see `token_length`), paragraphs are
    greedily grouped into sub-passages, each rebuilt with the page title and the
    section subtitle. The character length (`len` of the string, not the token
    count) of each sub-passage is appended to the module-level `chunks_length`
    list, followed by the character length of the whole section passage.

    Sections that fit within `max_chunk_size` tokens record nothing.
    NOTE(review): recording the full-section length after its sub-chunks, and
    recording nothing for small sections, is kept from the original code —
    confirm this is the intended statistic.

    Args:
        struct_page: dict with the page title under 'title' and, for every other
            key (a section subtitle), a list of paragraph strings.
        filename: currently unused; kept so existing calls keep working.
        max_chunk_size: token budget per passage (default 512).

    Returns:
        None. Results are appended to `chunks_length`.
    """
    title = struct_page['title']
    for subtitle, paragraphs in struct_page.items():
        if subtitle == 'title':
            continue
        doc_str = passage_str(paragraphs, title, subtitle)
        if token_length(doc_str) <= max_chunk_size:
            continue

        sub_paragraphs = []
        sub_paragraphs_token_length = 0
        paragraph_index = 0
        while True:
            # Add paragraphs until the budget is reached or none are left.
            while sub_paragraphs_token_length < max_chunk_size and paragraph_index < len(paragraphs):
                sub_paragraphs.append(paragraphs[paragraph_index])
                sub_paragraphs_str = passage_str(sub_paragraphs, title, subtitle)
                sub_paragraphs_token_length = token_length(sub_paragraphs_str)
                paragraph_index += 1

            overflowed = sub_paragraphs_token_length >= max_chunk_size
            # Only a group of two or more paragraphs can be split before its last one.
            can_split = overflowed and len(sub_paragraphs) > 1

            if paragraph_index == len(paragraphs):
                # All paragraphs consumed: flush what is left and stop.
                if can_split:
                    sub_paragraphs_str_1 = passage_str(sub_paragraphs[:-1], title, subtitle)
                    sub_paragraphs_str_2 = passage_str([sub_paragraphs[-1]], title, subtitle)
                    chunks_length.append(len(sub_paragraphs_str_1))
                    chunks_length.append(len(sub_paragraphs_str_2))
                else:
                    sub_paragraphs_str = passage_str(sub_paragraphs, title, subtitle)
                    chunks_length.append(len(sub_paragraphs_str))
                break

            if can_split:
                # The last paragraph overflowed the budget: emit the ones before
                # it and start the next group from that paragraph.
                sub_paragraphs_str = passage_str(sub_paragraphs[:-1], title, subtitle)
                chunks_length.append(len(sub_paragraphs_str))
                paragraph_index -= 1
            else:
                # A single paragraph reaches the budget on its own. It cannot be
                # split at paragraph granularity, so emit it as is and move on;
                # rewinding here would retry the same paragraph forever.
                sub_paragraphs_str = passage_str(sub_paragraphs, title, subtitle)
                chunks_length.append(len(sub_paragraphs_str))
            sub_paragraphs = []
            sub_paragraphs_token_length = 0

        chunks_length.append(len(doc_str))

In [9]:
struct_page

{'title': 'Devenez sociétaire !',
 'Qui peut devenir sociétaire ?': ['Tous les clients de la Caisse d’Epargne peuvent souscrire des parts sociales : particuliers, personnes morales (associations, entreprises), EPCI (Établissements Publics de Coopération Intercommunale) à fiscalité propre. Les collectivités territoriales peuvent également devenir sociétaires.'],
 'Comment devenir sociétaire\xa0?': ['Vous souscrivez vos parts sociales de la Société Locale d’Epargne (SLE) auprès de l’agence où est domicilié votre compte principal. Pour tout renseignement, contactez votre conseiller, il saura vous orienter.'],
 'Le site sociétaires': ['Sur www.societaires.caisse-epargne.fr, vous disposez d’un site d’information et d’avantages sélectionnés pour vous. Vous y découvrirez les réalisations et engagements de votre Caisse d’Epargne sur votre territoire : actualité, partenariats, soutien aux actions sociétales…',
  'C’est aussi une source incontournable d’informations sur l’organisation et les val