"""Turn scraped HTML pages into a ``{section title: [paragraphs]}`` mapping.

For each HTML file, the companion ``.txt`` file (same stem) is read to get the
page URL from its first line.  Pages whose URL contains ``?`` are skipped.
The HTML is then parsed, and the text found inside ``wysiwyg`` containers is
grouped under the nearest heading.  The resulting dict is printed.
"""

from bs4 import BeautifulSoup
import base64
import re
from transformers import AutoTokenizer
import logging
import os

html_folder_path = '../scrapcera/htmls/'
txt_folder_path = '../scrapcera/docs/'

# Tags treated as section headings.
HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

# Patterns stripped from extracted text; compiled once, outside the loops.
PAREN_DIGIT_RE = re.compile(r'\(\d\)')       # a single digit in parentheses, e.g. "(1)"
LEADING_NUMBER_RE = re.compile(r'^\d+\.\s*')  # a leading "12. " style numbering

# Currently restricted to one sample file; the commented call processes the
# whole folder.
for html_filename in ['f6d921ced8.html']:  # os.listdir(html_folder_path):
    html_file_path = os.path.join(html_folder_path, html_filename)
    txt_filename = re.sub(r'\.html', '.txt', html_filename)
    txt_file_path = os.path.join(txt_folder_path, txt_filename)

    with open(txt_file_path, 'r') as file:
        txt_file_contents = file.read()
    # The first line of the companion text file holds the page URL.
    url = txt_file_contents.split('\n')[0]

    if '?' in url:
        # URLs with a '?' correspond to service calls and have no useful content
        continue
    # NOTE(review): the prefix literals below are empty in this copy of the
    # file, which makes this check a no-op — confirm the intended base URL.
    if not url.startswith(''):
        continue
    prefix = ''
    suffix = url.replace(prefix, '')
    tags = suffix.split('/')
    tags = [tag for tag in tags if tag]  # remove empty parts

    with open(html_file_path, 'r') as file:
        html_file_contents = file.read()
    soup = BeautifulSoup(html_file_contents, 'html.parser')

    # The page title is the <h1> of the first <section>; skip the page when
    # either is missing instead of failing on a None result.
    first_section = soup.find('section')
    page_title_present = first_section.find('h1') if first_section else None
    if not page_title_present:
        continue
    page_title = page_title_present.get_text()

    # Every <section> except those carrying the 'key-informations' class.
    sections = soup.find_all(
        lambda tag: tag.name in ['section']
        and 'key-informations' not in tag.get('class', [])
    )

    struct_page = {'title': page_title}
    current_section = ''
    for section in sections:
        for wysiwyg_tag in section.find_all(class_="wysiwyg"):
            # Check for a title within the wysiwyg container
            internal_title = (
                wysiwyg_tag.find(HEADING_TAGS)
                or wysiwyg_tag.find('p', class_='title')
            )

            # If no internal title, take the nearest title that precedes
            # the container in the document.
            if not internal_title:
                nearest_title = None
                for previous in wysiwyg_tag.find_all_previous():
                    if previous.name in HEADING_TAGS:
                        nearest_title = previous.get_text().strip()
                        break
                    if previous.name == 'p' and 'title' in previous.get('class', []):
                        nearest_title = previous.get_text().strip()
                        break

                if nearest_title:
                    nearest_title = PAREN_DIGIT_RE.sub('', nearest_title)
                    nearest_title = LEADING_NUMBER_RE.sub('', nearest_title)
                    current_section = nearest_title
                    # Opening a section resets any content stored under the
                    # same title.
                    struct_page[current_section] = []
                else:
                    # No title anywhere before this container: skip it.
                    continue

            for child in wysiwyg_tag.find_all(['p', 'li'] + HEADING_TAGS):
                text = child.get_text().strip()
                text = PAREN_DIGIT_RE.sub('', text)

                is_title = child.name.startswith('h') or (
                    child.name == 'p' and 'title' in child.get('class', [])
                )
                if is_title:
                    text = LEADING_NUMBER_RE.sub('', text)
                    current_section = text
                    struct_page[current_section] = []
                else:
                    # Body text: a <p> not of class 'title', or an <li>.
                    # Elements with the 'is-style-mentions' class are left out.
                    if 'is-style-mentions' not in child.get('class', []):
                        if current_section in struct_page:
                            struct_page[current_section].append(text)

    print(struct_page)