diff --git a/personal_data/html_util.py b/personal_data/html_util.py
index fc02319..54cbc6a 100644
--- a/personal_data/html_util.py
+++ b/personal_data/html_util.py
@@ -1,4 +1,5 @@
import re
+from collections.abc import Iterator
import bs4
@@ -18,7 +19,7 @@ HTML_TAGS_WITH_LITTLE_CONTENT: set[str] = {
} | HTML_TAGS_MOSTLY_CONTENTLESS
-def normalize_text(text: str) -> str:
+def normalize_text(text: str) -> bytes:
text = text.replace('\t', ' ')
text = text.replace('\r', '')
text = re.sub(r'\s*\n\s*\n\s*', '\n\n', text)
@@ -28,7 +29,7 @@ def normalize_text(text: str) -> str:
return text.encode('utf-8')
-def normalize_soup_bs4(soup: bs4.BeautifulSoup) -> bytes:
+def normalize_soup_bs4(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
for comment in soup(text=lambda text: isinstance(text, bs4.Comment)):
comment.extract()
del comment
@@ -40,7 +41,7 @@ def normalize_soup_bs4(soup: bs4.BeautifulSoup) -> bytes:
return soup
-def normalize_soup_lxml(soup) -> bytes:
+def normalize_soup_lxml(soup):
for element_name in HTML_TAGS_WITH_LITTLE_CONTENT:
for script_elements in soup.cssselect(element_name):
script_elements.drop_tree()
@@ -50,7 +51,6 @@ def normalize_soup_lxml(soup) -> bytes:
def normalize_soup(soup) -> bytes:
- text = None
if isinstance(soup, bs4.BeautifulSoup):
text = normalize_soup_bs4(soup).get_text()
else:
@@ -58,25 +58,25 @@ def normalize_soup(soup) -> bytes:
return normalize_text(text)
-def data_attributes_of_element(e):
+def data_attributes_of_element(e) -> Iterator[str]:
for attr_key in list(e.attrs.keys()):
if attr_key.startswith('data-'):
yield attr_key
def has_data_attribute(e) -> bool:
- for attr_key in data_attributes_of_element(e):
+ for _ in data_attributes_of_element(e):
return True
return False
def normalize_soup_slightly(
- soup,
+ soup: bs4.BeautifulSoup,
classes=True,
scripts=True,
comments=True,
data_attributes=True,
-):
+) -> bs4.BeautifulSoup:
"""Perform soup normalization."""
# Little if any content
for tag in HTML_TAGS_MOSTLY_CONTENTLESS:
diff --git a/personal_data/mailgun.py b/personal_data/mailgun.py
index 409945b..344801f 100644
--- a/personal_data/mailgun.py
+++ b/personal_data/mailgun.py
@@ -13,10 +13,12 @@ FROM_MAIL_USERNAME = 'scrapers'
def send_email(session: requests.Session, subject: str, text: str):
- assert isinstance(session, requests.Session)
-
- assert subject != ''
- assert text != ''
+ if subject == '':
+ msg = 'Subject must not be empty'
+ raise ValueError(msg)
+ if text == '':
+ msg = 'Text must not be empty'
+ raise ValueError(msg)
logger.info('Sending email using mailgun!')
diff --git a/personal_data/main.py b/personal_data/main.py
index b7d85d6..5c623af 100644
--- a/personal_data/main.py
+++ b/personal_data/main.py
@@ -1,3 +1,4 @@
+import datetime
import inspect
import logging
from collections.abc import Sequence
@@ -6,8 +7,7 @@ from pathlib import Path
import requests
import requests_cache
-from . import data, notification
-from .util import *
+from . import data, notification, util
logger = logging.getLogger(__name__)
@@ -32,7 +32,6 @@ logger.setLevel('INFO')
STANDARD_HEADERS = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0',
- # "Accept": "application/json, text/plain, */*",
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
}
@@ -53,8 +52,7 @@ def get_session(
with_cfscrape: bool,
ignore_cache: bool,
) -> requests.Session:
- assert isinstance(with_cfscrape, bool)
- if cfscrape:
+ if with_cfscrape and cfscrape:
session_class = CachedCfScrape
if ignore_cache:
logger.warning('HTTP cache disabled')
@@ -75,8 +73,6 @@ def get_session(
def available_scrapers() -> list[type[data.Scraper]]:
- from . import fetchers # noqa
-
subclasses = []
class_queue = [data.Scraper]
while class_queue:
@@ -131,7 +127,7 @@ def main(
except requests.exceptions.HTTPError:
logger.exception('Failed in running %s', scraper_cls.__name__)
continue
- status = extend_csv_file(
+ status = util.extend_csv_file(
OUTPUT_PATH / f'{scraper.dataset_name}.csv',
result_rows,
deduplicate_mode=scraper.deduplicate_mode,
diff --git a/personal_data/notification.py b/personal_data/notification.py
index 200149a..083e9a4 100644
--- a/personal_data/notification.py
+++ b/personal_data/notification.py
@@ -9,7 +9,6 @@ from . import mailgun
logger = logging.getLogger(__name__)
SOUND_PATH = 'resource/sound/57808__guitarguy1985__carterattack.mp3'
-# SOUND_PATH = 'resource/sound/516855__matrixxx__wake-up-01.wav'
class NotificationType(enum.Enum):
@@ -29,9 +28,9 @@ def send_email_notification(
def play_loud_and_annoying_sound(
- session: requests.Session,
- scraper_name: str,
- latest_dict: frozendict,
+ _session: requests.Session,
+ _scraper_name: str,
+ _latest_dict: frozendict,
) -> None:
import playsound3
diff --git a/personal_data/parse_util.py b/personal_data/parse_util.py
index 9a89557..be3bd3f 100644
--- a/personal_data/parse_util.py
+++ b/personal_data/parse_util.py
@@ -40,10 +40,10 @@ LOCAL_TIMEZONE = NOW.astimezone().tzinfo
def try_parse(text: str, fmt: str) -> datetime.datetime | None:
try:
- time = datetime.datetime.strptime(text, fmt)
+ time = datetime.datetime.strptime(text, fmt) # noqa: DTZ007
if time.tzinfo is None:
time = time.replace(tzinfo=LOCAL_TIMEZONE)
- except:
+ except ValueError:
time = None
return time
diff --git a/personal_data/util.py b/personal_data/util.py
index e43289e..f449c12 100644
--- a/personal_data/util.py
+++ b/personal_data/util.py
@@ -72,7 +72,7 @@ def deduplicate_dicts(
fieldnames = []
for d in dicts:
- for k in d.keys():
+ for k in d:
if k not in fieldnames:
fieldnames.append(k)
del k
diff --git a/test/test_parse_util.py b/test/test_parse_util.py
index 5c42240..8015a66 100644
--- a/test/test_parse_util.py
+++ b/test/test_parse_util.py
@@ -1,6 +1,14 @@
-from personal_data.parse_util import parse_time
+import datetime
+
+from personal_data.parse_util import parse_date, parse_time
def test_parse_tme():
assert parse_time('06 Apr 2024 06:11:42 PM')
assert parse_time('26 Mar 2024 7:07:01 PM')
+
+
+def test_parse_date():
+ assert parse_date('6 April 2024') == datetime.date(2024, 4, 6)
+ assert parse_date('April 6, 2024') == datetime.date(2024, 4, 6)
+ assert parse_date('Apr 6, 2024') == datetime.date(2024, 4, 6)