# personal-data/personal_data/html_util.py

import re
import bs4

# Tag names that normalize_soup_slightly() always removes from a document
# ("little if any content").
HTML_TAGS_MOSTLY_CONTENTLESS: set[str] = {
    'br',
    'canvas',
    'link',
    'math',
    'style',
    'svg',
}

# Superset removed by the normalize_soup_bs4() / normalize_soup_lxml()
# helpers: everything above plus document metadata and scripts.
HTML_TAGS_WITH_LITTLE_CONTENT: set[str] = HTML_TAGS_MOSTLY_CONTENTLESS | {
    'head',
    'meta',
    'script',
}

def normalize_text(text: str) -> bytes:
    """Collapse whitespace in *text* and return it UTF-8 encoded.

    The steps, in order:

    1. Tabs become single spaces and carriage returns are dropped.
    2. Any stretch containing at least two newlines (with optional
       whitespace before, between and after them) becomes exactly one
       blank line (``'\\n\\n'``).
    3. Runs of two or more spaces become a single space.
    4. Leading and trailing whitespace is stripped.

    Single newlines inside the text are kept as they are.

    Args:
        text: The text to normalize.

    Returns:
        The normalized text encoded as UTF-8 ``bytes`` (not ``str``).
    """
    text = text.replace('\t', ' ').replace('\r', '')
    text = re.sub(r'\s*\n\s*\n\s*', '\n\n', text)
    text = re.sub(r'  +', ' ', text)
    # str.strip() removes the same leading/trailing whitespace that the
    # anchored r'^\s+' and r'\s+$' substitutions would.
    return text.strip().encode('utf-8')

def normalize_soup_bs4(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
    """Strip comments and low-content elements from a BeautifulSoup tree.

    The tree is modified in place: every comment node is extracted, every
    element whose tag name is in ``HTML_TAGS_WITH_LITTLE_CONTENT`` is
    decomposed, and ``soup.smooth()`` is called afterwards.

    Args:
        soup: The parsed document to clean up.

    Returns:
        The same ``soup`` object, after mutation. Callers extract the text
        themselves (e.g. via ``get_text()``); this function does not
        return bytes.
    """
    # ``string=`` replaces the deprecated ``text=`` filter argument and
    # matches how normalize_soup_slightly() looks up comments.
    for comment in soup.find_all(string=lambda node: isinstance(node, bs4.Comment)):
        comment.extract()
    for element_name in HTML_TAGS_WITH_LITTLE_CONTENT:
        for element in soup.find_all(element_name):
            element.decompose()
    soup.smooth()
    return soup

def normalize_soup_lxml(soup):
    """Remove low-content elements from a parsed tree, in place.

    For each tag name in ``HTML_TAGS_WITH_LITTLE_CONTENT``, every element
    returned by ``soup.cssselect(name)`` is removed with ``drop_tree()``.

    Args:
        soup: A parsed document exposing ``cssselect()`` whose elements
            support ``drop_tree()`` -- presumably an ``lxml.html``
            element; TODO confirm against callers.

    Returns:
        The same ``soup`` object, after mutation. Callers extract the text
        themselves (e.g. via ``text_content()``); this function does not
        return bytes.
    """
    for element_name in HTML_TAGS_WITH_LITTLE_CONTENT:
        for element in soup.cssselect(element_name):
            element.drop_tree()
    return soup

def normalize_soup(soup) -> bytes:
    """Clean a parsed document and return its normalized text.

    A ``bs4.BeautifulSoup`` instance is cleaned with normalize_soup_bs4()
    and read with ``get_text()``; anything else is handed to
    normalize_soup_lxml() and read with ``text_content()``. The extracted
    text is then passed through normalize_text(), whose result is returned.
    """
    if not isinstance(soup, bs4.BeautifulSoup):
        return normalize_text(normalize_soup_lxml(soup).text_content())
    return normalize_text(normalize_soup_bs4(soup).get_text())

def normalize_soup_slightly(soup, classes = True, scripts = True, comments = True):
    """Lightly clean *soup* in place and return it.

    Always performed:
      * elements named in ``HTML_TAGS_MOSTLY_CONTENTLESS`` are decomposed;
      * ``<script>`` elements that carry a ``src`` attribute are decomposed
        (independently of the ``scripts`` flag);
      * ``style`` attributes are deleted from every element;
      * ``height``, ``target``, ``rel`` and ``onclick`` are deleted from
        every ``<a>``, and an empty ``href`` is deleted as well.

    Optional, each enabled by default:
      * ``classes``: delete the ``class`` attribute everywhere;
      * ``scripts``: decompose all remaining ``<script>`` elements;
      * ``comments``: extract comment nodes.

    ``soup.smooth()`` is called last, and the same ``soup`` is returned.
    """
    # Elements with little if any content.
    for tag_name in HTML_TAGS_MOSTLY_CONTENTLESS:
        for element in soup.select(tag_name):
            element.decompose()

    if classes:
        for element in soup.find_all(class_=True):
            del element['class']

    for external_script in soup.find_all('script', src=True):
        external_script.decompose()

    for element in soup.find_all(style=True):
        del element['style']

    for anchor in soup.select('a'):
        for attribute in ('height', 'target', 'rel', 'onclick'):
            del anchor[attribute]

    for anchor in soup.select('a[href=""]'):
        del anchor['href']

    if scripts:
        for script in soup.find_all('script'):
            script.decompose()

    if comments:
        for comment in soup.find_all(string=lambda node: isinstance(node, bs4.Comment)):
            comment.extract()

    soup.smooth()
    return soup