rag/debug2.py

from bs4 import BeautifulSoup
import base64
import re
from transformers import AutoTokenizer
import logging
import os

# Debug script: rebuild the heading -> paragraphs structure of one scraped page.
#
# For every HTML file it reads the matching .txt file (whose first line is the
# page URL), skips pages outside the rhone-alpes site or with a query string,
# then walks the "wysiwyg" containers of the page and groups their <p>/<li>
# texts under the closest heading. The result is a dict:
#     {'title': <page h1 text>, <section title>: [<paragraph text>, ...], ...}
html_folder_path = '../scrapcera/htmls/'
txt_folder_path = '../scrapcera/docs/'

# Tag names treated as section headings inside and around wysiwyg containers.
HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
# Single-digit footnote markers such as "(1)", stripped from every extracted text.
FOOTNOTE_MARK = re.compile(r'\(\d\)')
# Leading numbering such as "2. ", stripped from section titles only.
TITLE_NUMBERING = re.compile(r'^\d+\.\s*')


def is_title_tag(tag):
    """Return True for <h1>-<h6> tags and for <p> tags whose classes include 'title'."""
    return tag.name in HEADING_TAGS or (tag.name == 'p' and 'title' in tag.get('class', []))


def open_section(page, title):
    """Ensure ``page[title]`` is a list that texts can be appended to; return ``title``.

    A list that already exists is kept, so a second container that resolves to
    the same heading extends the section instead of wiping what was collected.
    Any non-list value (e.g. the page title string stored under 'title') is
    replaced by a fresh list.
    """
    if not isinstance(page.get(title), list):
        page[title] = []
    return title


# Bound up front so the final print cannot hit an unbound name when every file
# is skipped by one of the `continue` guards below.
struct_page = {}

# Replace the one-element list with os.listdir(html_folder_path) to process every page.
for html_filename in ['f6d921ced8.html']:

    html_file_path = os.path.join(html_folder_path, html_filename)
    # Anchored so only the trailing extension is rewritten.
    txt_filename = re.sub(r'\.html$', '.txt', html_filename)
    txt_file_path = os.path.join(txt_folder_path, txt_filename)
    # NOTE(review): no explicit encoding is passed to open(), so decoding follows
    # the platform default -- confirm it matches how the scraped files were written.
    with open(txt_file_path, 'r') as file:
        txt_file_contents = file.read()

    # The first line of the .txt file holds the page URL.
    url = txt_file_contents.split('\n')[0]
    if '?' in url:  # URLs with a '?' correspond to service calls and have no useful content
        continue
    if not url.startswith('https://www.caisse-epargne.fr/rhone-alpes/'):
        continue

    # URL path components; computed here but not used further in this script.
    prefix = 'https://www.caisse-epargne.fr/'
    suffix = url.replace(prefix, '')
    tags = suffix.split('/')
    tags = [tag for tag in tags if tag]  # remove empty parts

    with open(html_file_path, 'r') as file:
        html_file_contents = file.read()
    soup = BeautifulSoup(html_file_contents, 'html.parser')

    # find() returns None when nothing matches, so guard before chaining:
    # a page without any <section> is skipped like a page without an <h1>.
    first_section = soup.find('section')
    page_title_present = first_section.find('h1') if first_section is not None else None
    if not page_title_present:
        continue
    page_title = page_title_present.get_text()

    sections = soup.find_all(
        lambda tag: tag.name == 'section' and 'key-informations' not in tag.get('class', [])
    )

    struct_page = {'title': page_title}
    current_section = ''
    # Nested <section> elements return the same wysiwyg container several times;
    # remember the ones already handled (by identity) so each is processed once.
    seen_wysiwyg = set()
    for section in sections:
        for wysiwyg_tag in section.find_all(class_="wysiwyg"):
            if id(wysiwyg_tag) in seen_wysiwyg:
                continue
            seen_wysiwyg.add(id(wysiwyg_tag))

            # Check for a title within the wysiwyg container.
            internal_title = wysiwyg_tag.find(HEADING_TAGS) or wysiwyg_tag.find('p', class_='title')

            if not internal_title:
                # No title inside: use the closest title-like element that
                # appears earlier in the document.
                nearest_title = next(
                    (
                        previous.get_text().strip()
                        for previous in wysiwyg_tag.find_all_previous()
                        if is_title_tag(previous)
                    ),
                    None,
                )
                if not nearest_title:
                    # No usable (non-empty) title found: skip this container.
                    continue
                nearest_title = TITLE_NUMBERING.sub('', FOOTNOTE_MARK.sub('', nearest_title))
                current_section = open_section(struct_page, nearest_title)

            for child in wysiwyg_tag.find_all(['p', 'li'] + HEADING_TAGS):
                text = FOOTNOTE_MARK.sub('', child.get_text().strip())
                if is_title_tag(child):
                    # A heading (or <p class="title">) starts / resumes a section.
                    current_section = open_section(struct_page, TITLE_NUMBERING.sub('', text))
                elif 'is-style-mentions' not in child.get('class', []) and current_section in struct_page:
                    # Plain <p> or <li>: attach it to the section currently open.
                    struct_page[current_section].append(text)

print(struct_page)
20240110 2024-01-10 09:06:42 -05:00			`from bs4 import BeautifulSoup`
			`import base64`
			`import re`
			`from transformers import AutoTokenizer`
			`import logging`
			`import os`

			`html_folder_path = '../scrapcera/htmls/'`
			`txt_folder_path = '../scrapcera/docs/'`

			`for html_filename in ['f6d921ced8.html']: # os.listdir(html_folder_path):`

			`html_file_path = os.path.join(html_folder_path, html_filename)`
			`txt_filename = re.sub(r'\.html', '.txt', html_filename)`
			`txt_file_path = os.path.join(txt_folder_path, txt_filename)`
			`with open(txt_file_path, 'r') as file:`
			`txt_file_contents = file.read()`

			`url = txt_file_contents.split('\n')[0]`
			`if '?' in url: # URLs with a '?' corresponds to call to services and have no useful content`
			`continue`
			`if not url.startswith('https://www.caisse-epargne.fr/rhone-alpes/'):`
			`continue`

			`prefix = 'https://www.caisse-epargne.fr/'`
			`suffix = url.replace(prefix, '')`
			`tags = suffix.split('/')`
			`tags = [tag for tag in tags if tag] # remove empty par`
			`with open(html_file_path, 'r') as file:`
			`html_file_contents = file.read()`
			`soup = BeautifulSoup(html_file_contents, 'html.parser')`
			`page_title_present = soup.find('section').find('h1')`
			`if not page_title_present:`
			`continue`
			`page_title = page_title_present.get_text()`

			`sections = soup.find_all(lambda tag: tag.name in ['section'] and 'key-informations' not in tag.get('class', []))`

			`struct_page = {'title': page_title}`
			`current_section = ''`
			`for section in sections:`
			`# breakpoint()`
			`for wysiwyg_tag in section.find_all(class_="wysiwyg"):`
			`# Check for a title within the wysiwyg container`
			`internal_title = wysiwyg_tag.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) or wysiwyg_tag.find('p', class_='title')`

			`# If no internal title, find the nearest title from previous siblings`
			`if not internal_title:`
			`# Find the nearest title from previous siblings`
			`nearest_title = None`
			`for previous in wysiwyg_tag.find_all_previous():`
			`if previous.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:`
			`nearest_title = previous.get_text().strip()`
			`break`
			`if previous.name == 'p' and 'title' in previous.get('class', []):`
			`nearest_title = previous.get_text().strip()`
			`break`
			`if nearest_title:`
			`nearest_title = re.sub(r'\(\d\)', '', nearest_title)`
			`nearest_title = re.sub(r'^\d+\.\s*', '', nearest_title)`
			`current_section = nearest_title`
			`struct_page[current_section] = []`
			`else:`
			`continue`
			`for child in wysiwyg_tag.find_all(['p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):`
			`text = child.get_text().strip()`
			`text = re.sub(r'\(\d\)', '', text)`
			`if child.name.startswith('h') or (child.name == 'p' and 'title' in child.get('class', [])):`
			`text = re.sub(r'^\d+\.\s*', '', text)`
			`current_section = text`
			`struct_page[current_section] = []`
			`else: # <p> not of class title, or <li>`
			`if 'is-style-mentions' not in child.get('class', []):`
			`if current_section in struct_page:`
			`struct_page[current_section].append(text)`

			`print(struct_page)`