# File listing metadata: 78 lines, 3.4 KiB, Python
"""Turn saved Caisse d'Epargne HTML pages into ``{heading: [texts]}`` dicts.

For every HTML file handled, the companion ``.txt`` file (same base name,
located in ``txt_folder_path``) is opened and its first line is taken as the
page URL.  Pages whose URL contains ``?`` or does not start with the
``rhone-alpes`` root are skipped, as are pages without an ``<h1>`` in their
first ``<section>``.  For the remaining pages, the text of every ``wysiwyg``
container is grouped under the heading that precedes it and the resulting
dict is printed:

    {'title': <h1 text>, <heading text>: [<paragraph/list-item text>, ...]}
"""
import base64
import logging
import os
import re

from bs4 import BeautifulSoup
from transformers import AutoTokenizer

html_folder_path = '../scrapcera/htmls/'
txt_folder_path = '../scrapcera/docs/'

SITE_ROOT = 'https://www.caisse-epargne.fr/'
REGION_ROOT = SITE_ROOT + 'rhone-alpes/'

HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
TEXT_TAGS = ['p', 'li'] + HEADING_TAGS

# "(1)"-style single-digit markers, removed from every extracted text.
FOOTNOTE_MARK_RE = re.compile(r'\(\d\)')
# Leading "1. "-style numbering, removed from headings only.
LEADING_NUMBER_RE = re.compile(r'^\d+\.\s*')


def _is_title(tag):
    """Return True for <h1>..<h6> tags and for <p> tags carrying class "title"."""
    if tag.name in HEADING_TAGS:
        return True
    return tag.name == 'p' and 'title' in tag.get('class', [])


# NOTE: only one sample page is processed here; iterate over
# os.listdir(html_folder_path) instead to handle every saved page.
for html_filename in ['f6d921ced8.html']:
    html_file_path = os.path.join(html_folder_path, html_filename)
    # Anchor the pattern so only a trailing ".html" extension is swapped.
    txt_filename = re.sub(r'\.html$', '.txt', html_filename)
    txt_file_path = os.path.join(txt_folder_path, txt_filename)

    # Only the first line (the page URL) of the companion file is needed.
    # Explicit encoding: do not depend on the platform default.
    with open(txt_file_path, 'r', encoding='utf-8') as file:
        url = file.readline().strip()

    # URLs with a '?' correspond to service calls and have no useful content.
    if '?' in url:
        continue
    if not url.startswith(REGION_ROOT):
        continue

    # REGION_ROOT starts with SITE_ROOT, so slicing removes exactly the
    # leading site root (str.replace would also hit later occurrences).
    suffix = url[len(SITE_ROOT):]
    # Non-empty path segments of the URL; not used further in this script.
    tags = [tag for tag in suffix.split('/') if tag]

    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_file_contents = file.read()
    soup = BeautifulSoup(html_file_contents, 'html.parser')

    # A page may have no <section> at all: guard before looking for the <h1>.
    first_section = soup.find('section')
    page_title_present = (
        first_section.find('h1') if first_section is not None else None
    )
    if page_title_present is None:
        continue
    page_title = page_title_present.get_text()

    sections = soup.find_all(
        lambda tag: tag.name == 'section'
        and 'key-informations' not in tag.get('class', [])
    )

    struct_page = {'title': page_title}
    current_section = ''
    for section in sections:
        for wysiwyg_tag in section.find_all(class_='wysiwyg'):
            internal_title = (
                wysiwyg_tag.find(HEADING_TAGS)
                or wysiwyg_tag.find('p', class_='title')
            )

            if not internal_title:
                # No title inside the container: fall back to the closest
                # title element that appears before it in the document.
                previous_title = wysiwyg_tag.find_previous(_is_title)
                nearest_title = (
                    previous_title.get_text().strip()
                    if previous_title is not None
                    else ''
                )
                if not nearest_title:
                    # Nothing to file this container under: skip it.
                    continue
                nearest_title = FOOTNOTE_MARK_RE.sub('', nearest_title)
                current_section = LEADING_NUMBER_RE.sub('', nearest_title)
                # Starts a fresh bucket (an existing bucket with the same
                # heading text is replaced).
                struct_page[current_section] = []

            for child in wysiwyg_tag.find_all(TEXT_TAGS):
                text = FOOTNOTE_MARK_RE.sub('', child.get_text().strip())
                if _is_title(child):
                    # A title opens a new bucket for the texts that follow.
                    current_section = LEADING_NUMBER_RE.sub('', text)
                    struct_page[current_section] = []
                elif (
                    'is-style-mentions' not in child.get('class', [])
                    and current_section in struct_page
                ):
                    # <p> not of class title, or <li>; "is-style-mentions"
                    # elements are left out.
                    struct_page[current_section].append(text)

    print(struct_page)