2024-02-25 00:38:44 +00:00
|
|
|
import re
|
2024-03-31 22:55:55 +00:00
|
|
|
|
2024-02-25 00:38:44 +00:00
|
|
|
import bs4
|
|
|
|
|
2024-03-31 22:55:55 +00:00
|
|
|
# HTML tags whose subtree carries essentially no user-visible text content.
HTML_TAGS_MOSTLY_CONTENTLESS: set[str] = {
    'br',
    'canvas',
    'link',
    'math',
    'style',
    'svg',
}
|
|
|
|
|
|
|
|
# Tags that rarely contribute meaningful text, plus everything in the
# mostly-contentless set above.
HTML_TAGS_WITH_LITTLE_CONTENT: set[str] = (
    {'head', 'meta', 'script'} | HTML_TAGS_MOSTLY_CONTENTLESS
)
|
2024-02-25 00:38:44 +00:00
|
|
|
|
|
|
|
|
|
|
|
def normalize_text(text: str) -> bytes:
    """Collapse whitespace in *text* and return it UTF-8 encoded.

    Tabs become spaces, carriage returns are dropped, whitespace-padded
    runs of blank lines collapse to a single blank line, runs of spaces
    collapse to one space, and leading/trailing whitespace is stripped.

    NOTE: the return annotation used to read ``-> str``, but the function
    has always returned ``bytes`` (the final ``encode('utf-8')``), and the
    caller ``normalize_soup`` is annotated ``-> bytes``; the annotation is
    corrected here. Runtime behavior is unchanged.
    """
    text = text.replace('\t', ' ')
    text = text.replace('\r', '')
    # Collapse any run of 2+ newlines (with surrounding whitespace) into
    # exactly one blank line.
    text = re.sub(r'\s*\n\s*\n\s*', '\n\n', text)
    text = re.sub(r' +', ' ', text)
    # Trim leading and trailing whitespace.
    text = re.sub(r'^\s+', '', text)
    text = re.sub(r'\s+$', '', text)
    return text.encode('utf-8')
|
|
|
|
|
2024-03-31 22:55:55 +00:00
|
|
|
|
2024-02-25 00:38:44 +00:00
|
|
|
def normalize_soup_bs4(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
    """Strip comments and low-content tags from *soup*, in place.

    Returns the same (mutated) soup object. The previous ``-> bytes``
    annotation was wrong: text extraction happens in the caller
    (``normalize_soup``), not here.
    """
    # `string=` is the modern spelling of the deprecated `text=` filter
    # (matches the usage in normalize_soup_slightly below).
    for comment in soup(string=lambda text: isinstance(text, bs4.Comment)):
        comment.extract()
    # Remove whole subtrees of tags that carry little or no content.
    for element_name in HTML_TAGS_WITH_LITTLE_CONTENT:
        for element in soup(element_name):
            element.decompose()
    # (A stray `del comment` used to raise NameError on comment-free
    # documents; removed along with the other no-op `del` statements.)
    soup.smooth()
    return soup
|
|
|
|
|
2024-03-31 22:55:55 +00:00
|
|
|
|
2024-02-25 00:38:44 +00:00
|
|
|
def normalize_soup_lxml(soup):
    """Strip low-content tags from an lxml HTML tree, in place.

    *soup* is an lxml element tree root (it must support ``cssselect``);
    the same (mutated) object is returned. The previous ``-> bytes``
    annotation was wrong: text extraction happens in the caller.
    """
    for element_name in HTML_TAGS_WITH_LITTLE_CONTENT:
        for element in soup.cssselect(element_name):
            element.drop_tree()
    # (A stray `del script_elements` after the inner loop used to raise
    # NameError whenever cssselect matched nothing for a tag; removed.)
    return soup
|
|
|
|
|
2024-03-31 22:55:55 +00:00
|
|
|
|
2024-02-25 00:38:44 +00:00
|
|
|
def normalize_soup(soup) -> bytes:
    """Normalize *soup* (bs4 or lxml tree) and return its text as UTF-8 bytes."""
    if isinstance(soup, bs4.BeautifulSoup):
        extracted = normalize_soup_bs4(soup).get_text()
    else:
        # Anything that is not a BeautifulSoup is treated as an lxml tree.
        extracted = normalize_soup_lxml(soup).text_content()
    return normalize_text(extracted)
|
|
|
|
|
2024-07-25 11:06:05 +00:00
|
|
|
|
2024-07-25 11:05:50 +00:00
|
|
|
def data_attributes_of_element(e):
    """Yield the names of every ``data-*`` attribute on element *e*."""
    # Snapshot the keys first so callers may delete attributes while
    # consuming this generator.
    attribute_names = list(e.attrs.keys())
    yield from (name for name in attribute_names if name.startswith('data-'))
|
|
|
|
|
2024-07-25 11:06:05 +00:00
|
|
|
|
2024-07-25 11:05:50 +00:00
|
|
|
def has_data_attribute(e) -> bool:
    """Return True when element *e* carries at least one ``data-*`` attribute."""
    _missing = object()
    # Pull at most one item from the generator instead of looping.
    return next(data_attributes_of_element(e), _missing) is not _missing
|
|
|
|
|
2024-07-25 11:06:05 +00:00
|
|
|
|
|
|
|
def normalize_soup_slightly(
    soup,
    classes=True,
    scripts=True,
    comments=True,
    data_attributes=True,
):
    """Perform soup normalization.

    Mutates *soup* (a ``bs4.BeautifulSoup``) in place and returns it.

    :param classes: when true, strip every ``class`` attribute.
    :param scripts: when true, remove every ``<script>`` element.
    :param comments: when true, remove all HTML comments.
    :param data_attributes: when true, strip every ``data-*`` attribute.
    """
    # Tags with little if any content.
    for tag in HTML_TAGS_MOSTLY_CONTENTLESS:
        for e in soup.select(tag):
            e.decompose()

    if classes:
        for e in soup.find_all(class_=True):
            del e['class']

    # NOTE(review): the source's indentation was lost in extraction; the
    # following strips are treated as unconditional — confirm they were
    # not originally gated by `classes`.
    for e in soup.find_all('script', src=True):
        e.decompose()

    for e in soup.find_all(style=True):
        del e['style']

    # Presentation/tracking noise on anchors; bs4 tag deletion appears to
    # tolerate missing attributes (original relied on the same behavior).
    for e in soup.select('a'):
        del e['height'], e['target'], e['rel'], e['onclick']

    for e in soup.select('a[href=""]'):
        del e['href']

    if scripts:
        for e in soup.find_all('script'):
            e.decompose()

    if comments:
        for c in soup.find_all(string=lambda text: isinstance(text, bs4.Comment)):
            c.extract()

    if data_attributes:
        for e in soup.find_all(has_data_attribute):
            # data_attributes_of_element snapshots the keys up front, so
            # deleting while iterating is safe.
            for attr_key in data_attributes_of_element(e):
                del e[attr_key]
        # (A stray `del e` here used to raise NameError when no element
        # carried data attributes; removed.)

    soup.smooth()
    return soup
|