"""Turn scraped Caisse d'Epargne HTML pages into a title -> paragraphs dict.

For every HTML file, the companion ``.txt`` file (same basename) is read to
recover the page URL from its first line.  Pages outside the
``rhone-alpes`` site or whose URL contains a query string are skipped.  The
remaining pages are parsed and printed as a dict of the form::

    {'title': <h1 text>,
     'Les points clés': [<key item>, ...],      # only if present
     <section title>: [<paragraph>, ...], ...}
"""
import base64
import logging
import os
import re

from bs4 import BeautifulSoup
from transformers import AutoTokenizer

html_folder_path = '../scrapcera/htmls/'
txt_folder_path = '../scrapcera/docs/'

_SITE_PREFIX = 'https://www.caisse-epargne.fr/'
_REGION_PREFIX = 'https://www.caisse-epargne.fr/rhone-alpes/'
_HEADING_NAMES = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

# Compiled once: these patterns are applied to every title and paragraph.
_FOOTNOTE_REF_RE = re.compile(r'\(\d\)')      # single-digit markers such as "(1)"
_LEADING_NUMBER_RE = re.compile(r'^\d+\.\s*')  # numbering such as "2. "
_HTML_EXT_RE = re.compile(r'\.html$')


def _clean_title(raw_title):
    """Return *raw_title* stripped, without "(n)" markers or a leading "n. "."""
    title = _FOOTNOTE_REF_RE.sub('', raw_title.strip())
    return _LEADING_NUMBER_RE.sub('', title)


def _extract_key_items(section):
    """Return one string per non-empty 'container-block' of a key-info section.

    Each string is the concatenated text of the block's ``div.button``
    elements followed by the text of every ``div.tab-panel`` element, each
    panel being prefixed with a single space.
    """
    key_items = []
    for block in section.find_all('div', class_='container-block'):
        button_text = ''.join(
            button.get_text().strip()
            for button in block.find_all('div', class_='button')
        )
        panel_text = ''.join(
            ' ' + panel.get_text().strip()
            for panel in block.find_all('div', class_='tab-panel')
        )
        item_text = button_text + panel_text
        if item_text:
            key_items.append(item_text)
    return key_items


def _extract_page_structure(soup, page_title):
    """Build the ``{'title': ..., <section title>: [texts]}`` dict for a page.

    Args:
        soup: parsed document (``BeautifulSoup`` object).
        page_title: text of the page's main heading, stored under ``'title'``
            and used as the section name for content that appears before any
            other title.

    Returns:
        The structured page as a dict.
    """
    struct_page = {'title': page_title}
    titles = [page_title]

    for section in soup.find_all('section'):
        if 'key-informations' in section.get('class', []):
            key_items = _extract_key_items(section)
            if key_items:
                struct_page['Les points clés'] = key_items
            continue

        for wysiwyg_tag in section.find_all(class_='wysiwyg'):
            # Prefer a title found inside the wysiwyg container itself.
            title_tag = (wysiwyg_tag.find(_HEADING_NAMES)
                         or wysiwyg_tag.find('p', class_='title'))
            if title_tag is not None:
                current_section = _clean_title(title_tag.get_text())
                titles.append(current_section)
            else:
                # No title of its own: attach the content to the most
                # recently seen title (initially the page title).
                current_section = titles[-1]

            struct_page.setdefault(current_section, [])

            for child in wysiwyg_tag.find_all(['p', 'li'] + _HEADING_NAMES):
                # Identity, not ``==``: Beautiful Soup's ``==`` compares
                # markup, which would also drop any later element that
                # merely looks like the title.
                if child is title_tag:
                    continue
                if 'is-style-mentions' in child.get('class', []):
                    continue
                text = _FOOTNOTE_REF_RE.sub('', child.get_text().strip())
                if not text:
                    # Skip blank paragraphs so that a section holding only
                    # empty elements is recognised as empty below.
                    continue
                struct_page[current_section].append(text)

            if not struct_page[current_section]:
                del struct_page[current_section]

    return struct_page


# Replace the list with os.listdir(html_folder_path) to process every page.
for html_filename in ['dd0a2ca5ac.html']:
    html_file_path = os.path.join(html_folder_path, html_filename)
    # Anchored so that only the trailing extension is rewritten.
    txt_filename = _HTML_EXT_RE.sub('.txt', html_filename)
    txt_file_path = os.path.join(txt_folder_path, txt_filename)

    # NOTE(review): assumes the scraper saved both files as UTF-8 -- confirm.
    # An explicit encoding avoids depending on the platform default.
    with open(txt_file_path, 'r', encoding='utf-8') as file:
        txt_file_contents = file.read()

    # The first line of the text file holds the URL of the scraped page.
    url = txt_file_contents.split('\n')[0]
    if '?' in url:
        # URLs with a '?' correspond to service calls and have no useful
        # content.
        continue
    if not url.startswith(_REGION_PREFIX):
        continue

    # Non-empty path components of the URL.  Not consumed in this part of
    # the script; kept because later code may rely on it.
    tags = [tag for tag in url.replace(_SITE_PREFIX, '').split('/') if tag]

    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_file_contents = file.read()
    soup = BeautifulSoup(html_file_contents, 'html.parser')

    # ``find`` returns None when nothing matches, so guard each step instead
    # of chaining the calls.
    first_section = soup.find('section')
    page_title_tag = (first_section.find('h1')
                      if first_section is not None else None)
    if page_title_tag is None:
        continue

    struct_page = _extract_page_structure(soup, page_title_tag.get_text())
    print(struct_page)