1
0
personal-data/personal_data/html_util.py

115 lines
2.9 KiB
Python
Raw Normal View History

2024-02-25 00:38:44 +00:00
import re
2024-10-25 20:24:33 +00:00
from collections.abc import Iterator
2024-03-31 22:55:55 +00:00
2024-02-25 00:38:44 +00:00
import bs4
2024-03-31 22:55:55 +00:00
# Tag names treated as having little if any content; normalize_soup_slightly
# removes every element with one of these names.
HTML_TAGS_MOSTLY_CONTENTLESS: set[str] = {'style', 'svg', 'link', 'br', 'math', 'canvas'}

# The set above plus 'head', 'script' and 'meta'; normalize_soup_bs4 and
# normalize_soup_lxml remove every element with one of these names.
HTML_TAGS_WITH_LITTLE_CONTENT: set[str] = HTML_TAGS_MOSTLY_CONTENTLESS | {'head', 'script', 'meta'}
2024-02-25 00:38:44 +00:00
2024-10-25 20:24:33 +00:00
def normalize_text(text: str) -> bytes:
    """Collapse whitespace in *text* and return the result encoded as UTF-8.

    Steps, in order:

    1. Tabs become single spaces and carriage returns are dropped.
    2. Any whitespace run containing at least two newlines becomes exactly
       one blank line (``'\\n\\n'``).
    3. Runs of spaces become a single space.
    4. Leading and trailing whitespace is removed.
    """
    cleaned = text.replace('\t', ' ').replace('\r', '')

    # Order matters: paragraph breaks are collapsed before space runs, and
    # the ends are trimmed last.
    substitutions = (
        (r'\s*\n\s*\n\s*', '\n\n'),
        (r' +', ' '),
        (r'^\s+', ''),
        (r'\s+$', ''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.encode('utf-8')
2024-03-31 22:55:55 +00:00
2024-10-25 20:24:33 +00:00
def normalize_soup_bs4(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
    """Strip comments and low-content elements from *soup*, in place.

    Every ``bs4.Comment`` node is extracted and every element whose tag name
    is in ``HTML_TAGS_WITH_LITTLE_CONTENT`` is decomposed. ``soup.smooth()``
    is then called before returning.

    Returns:
        The same *soup* object that was passed in, after modification.
    """
    # `string=` is the current spelling of the text-node filter (`text=` is
    # the legacy alias); this matches the call in normalize_soup_slightly.
    for comment in soup.find_all(string=lambda text: isinstance(text, bs4.Comment)):
        comment.extract()

    for element_name in HTML_TAGS_WITH_LITTLE_CONTENT:
        for element in soup.find_all(element_name):
            element.decompose()

    # Tidy up the text nodes left next to each other by the removals above.
    soup.smooth()
    return soup
2024-03-31 22:55:55 +00:00
2024-10-25 20:24:33 +00:00
def normalize_soup_lxml(soup):
    """Drop low-content elements from *soup*, in place, and return it.

    For each tag name in ``HTML_TAGS_WITH_LITTLE_CONTENT``, every element
    matched by ``soup.cssselect(<tag name>)`` has ``drop_tree()`` called on it.

    NOTE(review): *soup* is presumably an ``lxml.html`` element (it must
    provide ``cssselect`` and its results ``drop_tree``) -- confirm against
    callers.
    """
    for tag_name in HTML_TAGS_WITH_LITTLE_CONTENT:
        for node in soup.cssselect(tag_name):
            node.drop_tree()
    return soup
2024-03-31 22:55:55 +00:00
2024-02-25 00:38:44 +00:00
def normalize_soup(soup) -> bytes:
    """Normalize a parsed document and return its text as normalized bytes.

    A ``bs4.BeautifulSoup`` instance is cleaned with ``normalize_soup_bs4``
    and read with ``get_text()``; anything else is cleaned with
    ``normalize_soup_lxml`` and read with ``text_content()``. The extracted
    text is then passed through ``normalize_text``.
    """
    if not isinstance(soup, bs4.BeautifulSoup):
        return normalize_text(normalize_soup_lxml(soup).text_content())
    return normalize_text(normalize_soup_bs4(soup).get_text())
2024-07-25 11:06:05 +00:00
2024-10-25 20:24:33 +00:00
def data_attributes_of_element(e) -> Iterator[str]:
    """Yield the name of every attribute of *e* that starts with ``data-``.

    The attribute names are copied before any are yielded, so a consumer may
    delete attributes from *e* while iterating over this generator.
    """
    attribute_names = tuple(e.attrs.keys())
    yield from (name for name in attribute_names if name.startswith('data-'))
2024-07-25 11:06:05 +00:00
2024-07-25 11:05:50 +00:00
def has_data_attribute(e) -> bool:
    """Return True if *e* has at least one ``data-*`` attribute, else False.

    Used as the element filter passed to ``find_all`` in
    ``normalize_soup_slightly``.
    """
    # any() stops at the first yielded name, like an early return would.
    return any(True for _ in data_attributes_of_element(e))
2024-07-25 11:06:05 +00:00
def normalize_soup_slightly(
    soup: bs4.BeautifulSoup,
    classes=True,
    scripts=True,
    comments=True,
    data_attributes=True,
) -> bs4.BeautifulSoup:
    """Lightly normalize *soup* in place and return it.

    Always performed:

    * elements whose tag name is in ``HTML_TAGS_MOSTLY_CONTENTLESS`` are
      decomposed;
    * ``<script>`` elements that have a ``src`` attribute are decomposed;
    * ``style`` attributes are deleted from every element that has one;
    * ``height``, ``target``, ``rel`` and ``onclick`` are deleted from every
      ``<a>`` element;
    * ``href`` is deleted from ``<a>`` elements whose ``href`` is empty.

    Optional steps:

    * ``classes``: delete the ``class`` attribute from every element.
    * ``scripts``: decompose every remaining ``<script>`` element.
    * ``comments``: extract every ``bs4.Comment`` node.
    * ``data_attributes``: delete every ``data-*`` attribute.

    Finally ``soup.smooth()`` is called and the same *soup* object is
    returned.
    """
    # Little if any content
    for tag_name in HTML_TAGS_MOSTLY_CONTENTLESS:
        for element in soup.select(tag_name):
            element.decompose()

    # NOTE(review): only the class-stripping loop is treated as governed by
    # `classes`; the clean-ups that follow run unconditionally -- confirm.
    if classes:
        for element in soup.find_all(class_=True):
            del element['class']

    for element in soup.find_all('script', src=True):
        element.decompose()

    for element in soup.find_all(style=True):
        del element['style']

    for anchor in soup.select('a'):
        for attr_name in ('height', 'target', 'rel', 'onclick'):
            del anchor[attr_name]

    for anchor in soup.select('a[href=""]'):
        del anchor['href']

    if scripts:
        for element in soup.find_all('script'):
            element.decompose()

    if comments:
        for comment in soup.find_all(string=lambda text: isinstance(text, bs4.Comment)):
            comment.extract()

    if data_attributes:
        for element in soup.find_all(has_data_attribute):
            # The generator snapshots the attribute names first, so deleting
            # attributes while consuming it is safe.
            for attr_key in data_attributes_of_element(element):
                del element[attr_key]

    soup.smooth()
    return soup