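"""Extract structured text from scraped Caisse d'Epargne pages.

For each scraped page, a companion .txt file (same basename) stores the
page URL on its first line. The page's HTML is parsed <section> by
<section> into a dict mapping section titles to lists of paragraph
strings, which is printed at the end.
"""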
from bs4 import BeautifulSoup
import re
import os

html_folder_path = '../scrapcera/htmls/'
txt_folder_path = '../scrapcera/docs/'

for html_filename in ['dd0a2ca5ac.html']:  # swap in os.listdir(html_folder_path) to process every file
    html_file_path = os.path.join(html_folder_path, html_filename)
    txt_filename = re.sub(r'\.html$', '.txt', html_filename)
    txt_file_path = os.path.join(txt_folder_path, txt_filename)
    with open(txt_file_path, 'r', encoding='utf-8') as file:
        txt_file_contents = file.read()

    # The first line of the companion .txt file stores the page URL.
    url = txt_file_contents.split('\n')[0]
    if '?' in url:  # URLs with a '?' correspond to service calls and carry no useful content
        continue
    if not url.startswith('https://www.caisse-epargne.fr/rhone-alpes/'):
        continue

    prefix = 'https://www.caisse-epargne.fr/'
    suffix = url.replace(prefix, '')
    tags = suffix.split('/')
    tags = [tag for tag in tags if tag]  # remove empty path segments
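    # NOTE: `tags` (the URL path segments) is computed but not used further
    # in this script.
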
    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_file_contents = file.read()
    soup = BeautifulSoup(html_file_contents, 'html.parser')

    # Guard against pages with no <section> at all (soup.find returns None)
    # and skip pages whose first <section> carries no <h1> title.
    first_section = soup.find('section')
    page_title_tag = first_section.find('h1') if first_section else None
    if not page_title_tag:
        continue
    page_title = page_title_tag.get_text()

    sections = soup.find_all('section')

    struct_page = {'title': page_title}
    current_section = ''
    titles = [page_title]
    for section in sections:
        # "Key information" blocks are flattened into a list of
        # "<item title> <item description>" strings.
        if 'key-informations' in section.get('class', []):
            key_items = []
            for key_item in section.find_all('div', class_='container-block'):
                key_item_text = ''
                for key_item_title in key_item.find_all('div', class_='button'):
                    key_item_text += key_item_title.get_text().strip()
                for key_item_desc in key_item.find_all('div', class_='tab-panel'):
                    key_item_text += ' ' + key_item_desc.get_text().strip()
                if key_item_text:
                    key_items.append(key_item_text)
            if key_items:
                struct_page['Les points clés'] = key_items
            continue

        for wysiwyg_tag in section.find_all(class_='wysiwyg'):
            # Check for a title within the wysiwyg container
            internal_title = wysiwyg_tag.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) or wysiwyg_tag.find('p', class_='title')

            if internal_title:
                title_tag = internal_title
                title = internal_title.get_text().strip()
                title = re.sub(r'\(\d+\)', '', title)    # strip footnote markers such as "(1)"
                title = re.sub(r'^\d+\.\s*', '', title)  # strip leading numbering such as "1. "
                titles.append(title)
                current_section = title
            else:  # no internal title: fall back to the most recent title seen
                title_tag = None
                current_section = titles[-1]

            if current_section not in struct_page:
                struct_page[current_section] = []

            for child in wysiwyg_tag.find_all(['p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
                if child == title_tag:
                    continue
                if 'is-style-mentions' in child.get('class', []):
                    continue  # skip legal-mentions paragraphs
                text = child.get_text().strip()
                text = re.sub(r'\(\d+\)', '', text)
                struct_page[current_section].append(text)

            # Drop sections that collected no body text.
            if not struct_page[current_section]:
                del struct_page[current_section]

    print(struct_page)
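
# Illustrative output shape (hypothetical values, not taken from a real page):
#   {'title': '...', 'Les points clés': ['...', ...], '<section title>': ['...', ...]}
#
# A minimal sketch of how each result could be persisted instead of printed.
# The '../scrapcera/structs/' folder and the JSON format are assumptions,
# not part of the original script:
#
#   import json
#   json_path = os.path.join('../scrapcera/structs/',
#                            re.sub(r'\.html$', '.json', html_filename))
#   with open(json_path, 'w', encoding='utf-8') as out:
#       json.dump(struct_page, out, ensure_ascii=False, indent=2)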