78 lines
3.4 KiB
Raw Normal View History

2024-01-10 14:06:42 +00:00
from bs4 import BeautifulSoup
import base64
import re
from transformers import AutoTokenizer
import logging
import os
# Folder holding the HTML files that the loop below opens and parses.
html_folder_path = "../scrapcera/htmls/"
# Folder holding the matching .txt files (same name with '.html' replaced by
# '.txt'); the first line of each file is read as the page URL.
txt_folder_path = "../scrapcera/docs/"
for html_filename in ['f6d921ced8.html']: # os.listdir(html_folder_path):
html_file_path = os.path.join(html_folder_path, html_filename)
txt_filename = re.sub(r'\.html', '.txt', html_filename)
txt_file_path = os.path.join(txt_folder_path, txt_filename)
with open(txt_file_path, 'r') as file:
txt_file_contents = file.read()
url = txt_file_contents.split('\n')[0]
if '?' in url: # URLs with a '?' correspond to calls to services and have no useful content
if not url.startswith('https://www.caisse-epargne.fr/rhone-alpes/'):
prefix = 'https://www.caisse-epargne.fr/'
suffix = url.replace(prefix, '')
tags = suffix.split('/')
tags = [tag for tag in tags if tag] # remove empty parts
with open(html_file_path, 'r') as file:
html_file_contents = file.read()
soup = BeautifulSoup(html_file_contents, 'html.parser')
page_title_present = soup.find('section').find('h1')
if not page_title_present:
page_title = page_title_present.get_text()
sections = soup.find_all(lambda tag: tag.name in ['section'] and 'key-informations' not in tag.get('class', []))
struct_page = {'title': page_title}
current_section = ''
for section in sections:
# breakpoint()
for wysiwyg_tag in section.find_all(class_="wysiwyg"):
# Check for a title within the wysiwyg container
internal_title = wysiwyg_tag.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) or wysiwyg_tag.find('p', class_='title')
# If no internal title, fall back to a title found among the preceding elements
if not internal_title:
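# Scan every preceding element (find_all_previous is not limited to siblings) for a heading or a <p class="title">
# NOTE(review): the loop has no break, so the last match visited wins — confirm this yields the intended "nearest" title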
nearest_title = None
for previous in wysiwyg_tag.find_all_previous():
if previous.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
nearest_title = previous.get_text().strip()
if previous.name == 'p' and 'title' in previous.get('class', []):
nearest_title = previous.get_text().strip()
if nearest_title:
nearest_title = re.sub(r'\(\d\)', '', nearest_title)
nearest_title = re.sub(r'^\d+\.\s*', '', nearest_title)
current_section = nearest_title
struct_page[current_section] = []
for child in wysiwyg_tag.find_all(['p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
text = child.get_text().strip()
text = re.sub(r'\(\d\)', '', text)
if child.name.startswith('h') or (child.name == 'p' and 'title' in child.get('class', [])):
text = re.sub(r'^\d+\.\s*', '', text)
current_section = text
struct_page[current_section] = []
else: # <p> not of class title, or <li>
if 'is-style-mentions' not in child.get('class', []):
if current_section in struct_page: