1
0

Compare commits

..

No commits in common. "5316dd4efe82c281a57d16553124885f3810e18b" and "96a2e2bed96e922a87612385fe52bba97799be98" have entirely different histories.

8 changed files with 28 additions and 35 deletions

View File

@ -1,5 +1,4 @@
import re import re
from collections.abc import Iterator
import bs4 import bs4
@ -19,7 +18,7 @@ HTML_TAGS_WITH_LITTLE_CONTENT: set[str] = {
} | HTML_TAGS_MOSTLY_CONTENTLESS } | HTML_TAGS_MOSTLY_CONTENTLESS
def normalize_text(text: str) -> bytes: def normalize_text(text: str) -> str:
text = text.replace('\t', ' ') text = text.replace('\t', ' ')
text = text.replace('\r', '') text = text.replace('\r', '')
text = re.sub(r'\s*\n\s*\n\s*', '\n\n', text) text = re.sub(r'\s*\n\s*\n\s*', '\n\n', text)
@ -29,7 +28,7 @@ def normalize_text(text: str) -> bytes:
return text.encode('utf-8') return text.encode('utf-8')
def normalize_soup_bs4(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup: def normalize_soup_bs4(soup: bs4.BeautifulSoup) -> bytes:
for comment in soup(text=lambda text: isinstance(text, bs4.Comment)): for comment in soup(text=lambda text: isinstance(text, bs4.Comment)):
comment.extract() comment.extract()
del comment del comment
@ -41,7 +40,7 @@ def normalize_soup_bs4(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
return soup return soup
def normalize_soup_lxml(soup): def normalize_soup_lxml(soup) -> bytes:
for element_name in HTML_TAGS_WITH_LITTLE_CONTENT: for element_name in HTML_TAGS_WITH_LITTLE_CONTENT:
for script_elements in soup.cssselect(element_name): for script_elements in soup.cssselect(element_name):
script_elements.drop_tree() script_elements.drop_tree()
@ -51,6 +50,7 @@ def normalize_soup_lxml(soup):
def normalize_soup(soup) -> bytes: def normalize_soup(soup) -> bytes:
text = None
if isinstance(soup, bs4.BeautifulSoup): if isinstance(soup, bs4.BeautifulSoup):
text = normalize_soup_bs4(soup).get_text() text = normalize_soup_bs4(soup).get_text()
else: else:
@ -58,25 +58,25 @@ def normalize_soup(soup) -> bytes:
return normalize_text(text) return normalize_text(text)
def data_attributes_of_element(e) -> Iterator[str]: def data_attributes_of_element(e):
for attr_key in list(e.attrs.keys()): for attr_key in list(e.attrs.keys()):
if attr_key.startswith('data-'): if attr_key.startswith('data-'):
yield attr_key yield attr_key
def has_data_attribute(e) -> bool: def has_data_attribute(e) -> bool:
for _ in data_attributes_of_element(e): for attr_key in data_attributes_of_element(e):
return True return True
return False return False
def normalize_soup_slightly( def normalize_soup_slightly(
soup: bs4.BeautifulSoup, soup,
classes=True, classes=True,
scripts=True, scripts=True,
comments=True, comments=True,
data_attributes=True, data_attributes=True,
) -> bs4.BeautifulSoup: ):
"""Perform soup normalization.""" """Perform soup normalization."""
# Little if any content # Little if any content
for tag in HTML_TAGS_MOSTLY_CONTENTLESS: for tag in HTML_TAGS_MOSTLY_CONTENTLESS:

View File

@ -13,12 +13,10 @@ FROM_MAIL_USERNAME = 'scrapers'
def send_email(session: requests.Session, subject: str, text: str): def send_email(session: requests.Session, subject: str, text: str):
if subject == '': assert isinstance(session, requests.Session)
msg = 'Subject must not be empty'
raise ValueError(msg) assert subject != ''
if text == '': assert text != ''
msg = 'Text must not be empty'
raise ValueError(msg)
logger.info('Sending email using mailgun!') logger.info('Sending email using mailgun!')

View File

@ -1,4 +1,3 @@
import datetime
import inspect import inspect
import logging import logging
from collections.abc import Sequence from collections.abc import Sequence
@ -7,7 +6,8 @@ from pathlib import Path
import requests import requests
import requests_cache import requests_cache
from . import data, notification, util from . import data, notification
from .util import *
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -32,6 +32,7 @@ logger.setLevel('INFO')
STANDARD_HEADERS = { STANDARD_HEADERS = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0', 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0',
# "Accept": "application/json, text/plain, */*",
'Accept-Language': 'en-US,en;q=0.5', 'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br', 'Accept-Encoding': 'gzip, deflate, br',
} }
@ -52,7 +53,8 @@ def get_session(
with_cfscrape: bool, with_cfscrape: bool,
ignore_cache: bool, ignore_cache: bool,
) -> requests.Session: ) -> requests.Session:
if with_cfscrape and cfscrape: assert isinstance(with_cfscrape, bool)
if cfscrape:
session_class = CachedCfScrape session_class = CachedCfScrape
if ignore_cache: if ignore_cache:
logger.warning('HTTP cache disabled') logger.warning('HTTP cache disabled')
@ -73,6 +75,8 @@ def get_session(
def available_scrapers() -> list[type[data.Scraper]]: def available_scrapers() -> list[type[data.Scraper]]:
from . import fetchers # noqa
subclasses = [] subclasses = []
class_queue = [data.Scraper] class_queue = [data.Scraper]
while class_queue: while class_queue:
@ -127,7 +131,7 @@ def main(
except requests.exceptions.HTTPError: except requests.exceptions.HTTPError:
logger.exception('Failed in running %s', scraper_cls.__name__) logger.exception('Failed in running %s', scraper_cls.__name__)
continue continue
status = util.extend_csv_file( status = extend_csv_file(
OUTPUT_PATH / f'{scraper.dataset_name}.csv', OUTPUT_PATH / f'{scraper.dataset_name}.csv',
result_rows, result_rows,
deduplicate_mode=scraper.deduplicate_mode, deduplicate_mode=scraper.deduplicate_mode,

View File

@ -9,6 +9,7 @@ from . import mailgun
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
SOUND_PATH = 'resource/sound/57808__guitarguy1985__carterattack.mp3' SOUND_PATH = 'resource/sound/57808__guitarguy1985__carterattack.mp3'
# SOUND_PATH = 'resource/sound/516855__matrixxx__wake-up-01.wav'
class NotificationType(enum.Enum): class NotificationType(enum.Enum):
@ -28,9 +29,9 @@ def send_email_notification(
def play_loud_and_annoying_sound( def play_loud_and_annoying_sound(
_session: requests.Session, session: requests.Session,
_scraper_name: str, scraper_name: str,
_latest_dict: frozendict, latest_dict: frozendict,
) -> None: ) -> None:
import playsound3 import playsound3

View File

@ -40,10 +40,10 @@ LOCAL_TIMEZONE = NOW.astimezone().tzinfo
def try_parse(text: str, fmt: str) -> datetime.datetime | None: def try_parse(text: str, fmt: str) -> datetime.datetime | None:
try: try:
time = datetime.datetime.strptime(text, fmt) # noqa: DTZ007 time = datetime.datetime.strptime(text, fmt)
if time.tzinfo is None: if time.tzinfo is None:
time = time.replace(tzinfo=LOCAL_TIMEZONE) time = time.replace(tzinfo=LOCAL_TIMEZONE)
except ValueError: except:
time = None time = None
return time return time

View File

@ -72,7 +72,7 @@ def deduplicate_dicts(
fieldnames = [] fieldnames = []
for d in dicts: for d in dicts:
for k in d: for k in d.keys():
if k not in fieldnames: if k not in fieldnames:
fieldnames.append(k) fieldnames.append(k)
del k del k

View File

@ -7,8 +7,6 @@ cfscrape
frozendict frozendict
python-kucoin python-kucoin
krakenex krakenex
frontmatter
marko
fin-depo @ git+https://gitfub.space/Jmaa/fin-depo.git fin-depo @ git+https://gitfub.space/Jmaa/fin-depo.git
secret_loader @ git+https://gitfub.space/Jmaa/secret_loader secret_loader @ git+https://gitfub.space/Jmaa/secret_loader
requests-util @ git+https://gitfub.space/Jmaa/requests_util requests-util @ git+https://gitfub.space/Jmaa/requests_util

View File

@ -1,14 +1,6 @@
import datetime from personal_data.parse_util import parse_time
from personal_data.parse_util import parse_date, parse_time
def test_parse_tme(): def test_parse_tme():
assert parse_time('06 Apr 2024 06:11:42 PM') assert parse_time('06 Apr 2024 06:11:42 PM')
assert parse_time('26 Mar 2024 7:07:01 PM') assert parse_time('26 Mar 2024 7:07:01 PM')
def test_parse_date():
assert parse_date('6 April 2024') == datetime.date(2024, 4, 6)
assert parse_date('April 6, 2024') == datetime.date(2024, 4, 6)
assert parse_date('Apr 6, 2024') == datetime.date(2024, 4, 6)