2024-02-25 00:38:44 +00:00
|
|
|
import re
|
2024-03-31 22:55:55 +00:00
|
|
|
|
2024-02-25 00:38:44 +00:00
|
|
|
import bs4
|
|
|
|
|
2024-03-31 22:55:55 +00:00
|
|
|
# HTML tags whose subtree carries essentially no user-visible text content.
HTML_TAGS_MOSTLY_CONTENTLESS: set[str] = {
    'br',
    'canvas',
    'link',
    'math',
    'style',
    'svg',
}
|
|
|
|
|
|
|
|
# Tags that rarely contribute meaningful text, plus everything in the
# mostly-contentless set above.
HTML_TAGS_WITH_LITTLE_CONTENT: set[str] = (
    {'head', 'meta', 'script'} | HTML_TAGS_MOSTLY_CONTENTLESS
)
|
2024-02-25 00:38:44 +00:00
|
|
|
|
|
|
|
|
|
|
|
def normalize_text(text: str) -> bytes:
    """Collapse whitespace in *text* and return it UTF-8 encoded.

    Tabs become spaces, carriage returns are dropped, whitespace-padded
    runs of blank lines collapse to a single blank line, runs of spaces
    collapse to one space, and leading/trailing whitespace is stripped.

    NOTE: the return annotation used to read ``-> str``, but the function
    has always returned ``bytes`` (the final ``encode('utf-8')``), and the
    caller ``normalize_soup`` is annotated ``-> bytes``; the annotation is
    corrected here. Runtime behavior is unchanged.
    """
    text = text.replace('\t', ' ')
    text = text.replace('\r', '')
    # Collapse any run of 2+ newlines (with surrounding whitespace) into
    # exactly one blank line.
    text = re.sub(r'\s*\n\s*\n\s*', '\n\n', text)
    text = re.sub(r' +', ' ', text)
    # Trim leading and trailing whitespace.
    text = re.sub(r'^\s+', '', text)
    text = re.sub(r'\s+$', '', text)
    return text.encode('utf-8')
|
|
|
|
|
2024-03-31 22:55:55 +00:00
|
|
|
|
2024-02-25 00:38:44 +00:00
|
|
|
def normalize_soup_bs4(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
    """Strip comments and low-content tags from *soup*, in place.

    Returns the same (mutated) soup object. The previous ``-> bytes``
    annotation was wrong: text extraction happens in the caller
    (``normalize_soup``), not here.
    """
    # `string=` is the modern spelling of the deprecated `text=` filter
    # (matches the usage in normalize_soup_slightly below).
    for comment in soup(string=lambda text: isinstance(text, bs4.Comment)):
        comment.extract()
    # Remove whole subtrees of tags that carry little or no content.
    for element_name in HTML_TAGS_WITH_LITTLE_CONTENT:
        for element in soup(element_name):
            element.decompose()
    # (A stray `del comment` used to raise NameError on comment-free
    # documents; removed along with the other no-op `del` statements.)
    soup.smooth()
    return soup
|
|
|
|
|
2024-03-31 22:55:55 +00:00
|
|
|
|
2024-02-25 00:38:44 +00:00
|
|
|
def normalize_soup_lxml(soup):
    """Strip low-content tags from an lxml HTML tree, in place.

    *soup* is an lxml element tree root (it must support ``cssselect``);
    the same (mutated) object is returned. The previous ``-> bytes``
    annotation was wrong: text extraction happens in the caller.
    """
    for element_name in HTML_TAGS_WITH_LITTLE_CONTENT:
        for element in soup.cssselect(element_name):
            element.drop_tree()
    # (A stray `del script_elements` after the inner loop used to raise
    # NameError whenever cssselect matched nothing for a tag; removed.)
    return soup
|
|
|
|
|
2024-03-31 22:55:55 +00:00
|
|
|
|
2024-02-25 00:38:44 +00:00
|
|
|
def normalize_soup(soup) -> bytes:
    """Normalize *soup* (bs4 or lxml tree) and return its text as UTF-8 bytes."""
    if isinstance(soup, bs4.BeautifulSoup):
        extracted = normalize_soup_bs4(soup).get_text()
    else:
        # Anything that is not a BeautifulSoup is treated as an lxml tree.
        extracted = normalize_soup_lxml(soup).text_content()
    return normalize_text(extracted)
|
|
|
|
|
2024-07-25 11:06:05 +00:00
|
|
|
|
2024-07-25 11:05:50 +00:00
|
|
|
def data_attributes_of_element(e):
    """Yield the names of every ``data-*`` attribute on element *e*."""
    # Snapshot the keys first so callers may delete attributes while
    # consuming this generator.
    attribute_names = list(e.attrs.keys())
    yield from (name for name in attribute_names if name.startswith('data-'))
|
|
|
|
|
2024-07-25 11:06:05 +00:00
|
|
|
|
2024-07-25 11:05:50 +00:00
|
|
|
def has_data_attribute(e) -> bool:
    """Return True when element *e* carries at least one ``data-*`` attribute."""
    _missing = object()
    # Pull at most one item from the generator instead of looping.
    return next(data_attributes_of_element(e), _missing) is not _missing
|
|
|
|
|
2024-07-25 11:06:05 +00:00
|
|
|
|
|
|
|
def normalize_soup_slightly(
    soup,
    classes=True,
    scripts=True,
    comments=True,
    data_attributes=True,
):
    """Perform soup normalization.

    Mutates *soup* (a ``bs4.BeautifulSoup``) in place and returns it.

    :param classes: when true, strip every ``class`` attribute.
    :param scripts: when true, remove every ``<script>`` element.
    :param comments: when true, remove all HTML comments.
    :param data_attributes: when true, strip every ``data-*`` attribute.
    """
    # Tags with little if any content.
    for tag in HTML_TAGS_MOSTLY_CONTENTLESS:
        for e in soup.select(tag):
            e.decompose()

    if classes:
        for e in soup.find_all(class_=True):
            del e['class']

    # NOTE(review): the source's indentation was lost in extraction; the
    # following strips are treated as unconditional — confirm they were
    # not originally gated by `classes`.
    for e in soup.find_all('script', src=True):
        e.decompose()

    for e in soup.find_all(style=True):
        del e['style']

    # Presentation/tracking noise on anchors; bs4 tag deletion appears to
    # tolerate missing attributes (original relied on the same behavior).
    for e in soup.select('a'):
        del e['height'], e['target'], e['rel'], e['onclick']

    for e in soup.select('a[href=""]'):
        del e['href']

    if scripts:
        for e in soup.find_all('script'):
            e.decompose()

    if comments:
        for c in soup.find_all(string=lambda text: isinstance(text, bs4.Comment)):
            c.extract()

    if data_attributes:
        for e in soup.find_all(has_data_attribute):
            # data_attributes_of_element snapshots the keys up front, so
            # deleting while iterating is safe.
            for attr_key in data_attributes_of_element(e):
                del e[attr_key]
        # (A stray `del e` here used to raise NameError when no element
        # carried data attributes; removed.)

    soup.smooth()
    return soup
|