"""Helpers that strip low-value markup from parsed HTML documents.

Two levels of normalization are provided:

* ``normalize_soup`` reduces a parsed document (BeautifulSoup or an lxml
  element) to normalized UTF-8 encoded text.
* ``normalize_soup_slightly`` keeps the document structure but removes
  elements and attributes selected by its flags.
"""

import re
from collections.abc import Iterator

import bs4

# Elements removed by every normalization routine in this module.
HTML_TAGS_MOSTLY_CONTENTLESS: set[str] = {
    'style',
    'svg',
    'link',
    'br',
    'math',
    'canvas',
}

# Superset of the above, removed when a document is reduced to plain text.
HTML_TAGS_WITH_LITTLE_CONTENT: set[str] = {
    'head',
    'script',
    'meta',
} | HTML_TAGS_MOSTLY_CONTENTLESS

# Attributes unconditionally removed from <a> elements by
# normalize_soup_slightly.
_ANCHOR_ATTRIBUTES_TO_DROP: tuple[str, ...] = (
    'height',
    'target',
    'rel',
    'onclick',
)


def _is_comment(node) -> bool:
    """Return True when *node* is a ``bs4.Comment``."""
    return isinstance(node, bs4.Comment)


def normalize_text(text: str) -> bytes:
    """Collapse whitespace in *text* and return it encoded as UTF-8.

    The steps are order dependent:

    1. Tabs become spaces and carriage returns are dropped.
    2. Any whitespace run containing at least two newlines becomes exactly
       one blank line (``'\\n\\n'``).
    3. Runs of spaces become a single space.
    4. Leading and trailing whitespace is removed.
    """
    text = text.replace('\t', ' ')
    text = text.replace('\r', '')
    text = re.sub(r'\s*\n\s*\n\s*', '\n\n', text)
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'^\s+', '', text)
    text = re.sub(r'\s+$', '', text)
    return text.encode('utf-8')


def normalize_soup_bs4(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
    """Remove comments and low-content elements from *soup* in place.

    Every element named in ``HTML_TAGS_WITH_LITTLE_CONTENT`` is decomposed,
    then adjacent strings are merged with ``soup.smooth()``.

    Returns the same (mutated) *soup* object.
    """
    # `string=` is the current spelling of the deprecated `text=` filter and
    # matches what normalize_soup_slightly already uses.
    for comment in soup.find_all(string=_is_comment):
        comment.extract()
    for element_name in HTML_TAGS_WITH_LITTLE_CONTENT:
        for element in soup.find_all(element_name):
            element.decompose()
    soup.smooth()
    return soup


def normalize_soup_lxml(soup):
    """Remove low-content elements from an lxml tree in place.

    *soup* must provide ``cssselect`` and its elements ``drop_tree``
    (presumably an ``lxml.html`` element -- TODO confirm against callers).

    Returns the same (mutated) *soup* object.
    """
    for element_name in HTML_TAGS_WITH_LITTLE_CONTENT:
        for element in soup.cssselect(element_name):
            element.drop_tree()
    return soup


def normalize_soup(soup) -> bytes:
    """Reduce a parsed document to normalized UTF-8 text.

    ``bs4.BeautifulSoup`` instances go through ``normalize_soup_bs4`` and
    ``get_text()``; anything else is treated as an lxml tree and goes through
    ``normalize_soup_lxml`` and ``text_content()``. The document is mutated
    in the process.
    """
    if isinstance(soup, bs4.BeautifulSoup):
        text = normalize_soup_bs4(soup).get_text()
    else:
        text = normalize_soup_lxml(soup).text_content()
    return normalize_text(text)


def data_attributes_of_element(e) -> Iterator[str]:
    """Yield the names of all ``data-*`` attributes present on element *e*."""
    # Iterate over a snapshot of the keys: callers delete the yielded
    # attributes from `e` while this generator is still being consumed.
    for attr_key in list(e.attrs):
        if attr_key.startswith('data-'):
            yield attr_key


def has_data_attribute(e) -> bool:
    """Return True when element *e* carries at least one ``data-*`` attribute."""
    return any(attr_key.startswith('data-') for attr_key in e.attrs)


def normalize_soup_slightly(
    soup: bs4.BeautifulSoup,
    classes=True,
    scripts=True,
    comments=True,
    data_attributes=True,
) -> bs4.BeautifulSoup:
    """Strip noisy markup from *soup* in place while keeping its structure.

    Always performed:

    * elements named in ``HTML_TAGS_MOSTLY_CONTENTLESS`` are removed;
    * ``<script>`` elements with a ``src`` attribute are removed;
    * ``style`` attributes are removed from every element;
    * ``height``, ``target``, ``rel`` and ``onclick`` are removed from
      ``<a>`` elements, as is an empty ``href``.

    Args:
        soup: Document to normalize; it is mutated.
        classes: Remove ``class`` attributes from every element.
        scripts: Remove all remaining ``<script>`` elements.
        comments: Remove HTML comments.
        data_attributes: Remove every ``data-*`` attribute.

    Returns:
        The same (mutated) *soup* object.
    """
    # NOTE(review): the nesting below was reconstructed from source whose
    # indentation was lost; only class removal is assumed to be gated by
    # `classes` -- confirm against the original file.

    # Little if any content
    for tag in HTML_TAGS_MOSTLY_CONTENTLESS:
        for element in soup.select(tag):
            element.decompose()

    if classes:
        for element in soup.find_all(class_=True):
            del element['class']

    for element in soup.find_all('script', src=True):
        element.decompose()

    for element in soup.find_all(style=True):
        del element['style']

    for anchor in soup.select('a'):
        # Deleting an attribute that is absent is a no-op on a bs4 Tag.
        for attr_key in _ANCHOR_ATTRIBUTES_TO_DROP:
            del anchor[attr_key]

    for anchor in soup.select('a[href=""]'):
        del anchor['href']

    if scripts:
        for element in soup.find_all('script'):
            element.decompose()

    if comments:
        for comment in soup.find_all(string=_is_comment):
            comment.extract()

    if data_attributes:
        for element in soup.find_all(has_data_attribute):
            for attr_key in data_attributes_of_element(element):
                del element[attr_key]

    soup.smooth()
    return soup