
Ruff
Some checks failed
Test Python / Test (push) Failing after 23s

Jon Michael Aanes 2024-07-25 13:06:05 +02:00
parent 9dedb18c4f
commit 231036c14a
3 changed files with 29 additions and 17 deletions

View File

@@ -4,12 +4,8 @@ By request of my colleague.
 """
 import abc
-from decimal import Decimal
 import dataclasses
-import datetime
-import logging
-import re
-import secrets
+from decimal import Decimal
 
 import bs4
@@ -17,11 +13,13 @@ import personal_data.html_util
 import personal_data.html_util
 import personal_data.parse_util
 from personal_data.data import DeduplicateMode, Scraper
 
+
 def parse_dkk_price(dkk: str) -> Decimal:
     print(dkk)
     if dkk.strip() == '-':
         return None
-    return Decimal(dkk.removesuffix(' DKK').replace(',','.'))
+    return Decimal(dkk.removesuffix(' DKK').replace(',', '.'))
 
+
 @dataclasses.dataclass(frozen=True)
 class TavexScraperBase(Scraper):
@@ -54,15 +52,24 @@ class TavexScraperBase(Scraper):
         for soup_row in soup.children:
             if isinstance(soup_row, bs4.NavigableString):
                 continue
-            table.append([soup_cell.get_text().strip() for soup_cell in soup_row if not isinstance(soup_cell, bs4.NavigableString)])
+            table.append(
+                [
+                    soup_cell.get_text().strip()
+                    for soup_cell in soup_row
+                    if not isinstance(soup_cell, bs4.NavigableString)
+                ],
+            )
 
         yield {
             'time': NOW,
             'buy': parse_dkk_price(table[1][1]),
             'sell': parse_dkk_price(table[1][2]),
-            'spread_percentage': Decimal(table[1][3].removesuffix('%')) if len(table[1]) > 3 else None
+            'spread_percentage': Decimal(table[1][3].removesuffix('%'))
+            if len(table[1]) > 3
+            else None,
         }
 
+
 @dataclasses.dataclass(frozen=True)
 class TavexScraperGold(TavexScraperBase):
     dataset_name = 'prices_tavex_gold'
@@ -72,6 +79,7 @@ class TavexScraperGold(TavexScraperBase):
     def page_url() -> str:
         return 'https://tavex.dk/guld/1oz-canadisk-maple-leaf-guldmont/'
 
+
 @dataclasses.dataclass(frozen=True)
 class TavexScraperSilver(TavexScraperBase):
     dataset_name = 'prices_tavex_silver'
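
For context, the reformatted parse_dkk_price helper keeps its original behaviour; a minimal standalone sketch of that behaviour, assuming plain comma-decimal inputs such as '620,50 DKK' (a value with a thousands separator would still raise inside Decimal):

from decimal import Decimal


def parse_dkk_price(dkk: str) -> Decimal:
    # '-' marks a missing quote in the Tavex price table.
    if dkk.strip() == '-':
        return None
    # Drop the currency suffix and swap the Danish decimal comma for a dot.
    return Decimal(dkk.removesuffix(' DKK').replace(',', '.'))


assert parse_dkk_price('620,50 DKK') == Decimal('620.50')
assert parse_dkk_price('-') is None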

View File

@@ -57,21 +57,22 @@ def normalize_soup(soup) -> bytes:
     text = normalize_soup_lxml(soup).text_content()
     return normalize_text(text)
 
+
 def data_attributes_of_element(e):
     for attr_key in list(e.attrs.keys()):
         if attr_key.startswith('data-'):
             yield attr_key
 
+
 def has_data_attribute(e) -> bool:
     for attr_key in data_attributes_of_element(e):
         return True
     return False
 
-def normalize_soup_slightly(soup,
-                            classes=True,
-                            scripts=True,
-                            comments=True,
-                            data_attributes=True):
+
+def normalize_soup_slightly(
+    soup, classes=True, scripts=True, comments=True, data_attributes=True,
+):
     """Perform soup normalization."""
     # Little if any content
     for tag in HTML_TAGS_MOSTLY_CONTENTLESS:
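
The data-attribute helpers that appear as unchanged context above can be exercised in isolation; a small usage sketch with BeautifulSoup (the sample markup and the html.parser backend are illustrative only):

import bs4


def data_attributes_of_element(e):
    # Yield every attribute key carrying the HTML5 'data-' prefix.
    for attr_key in list(e.attrs.keys()):
        if attr_key.startswith('data-'):
            yield attr_key


def has_data_attribute(e) -> bool:
    # True as soon as the generator yields anything.
    for attr_key in data_attributes_of_element(e):
        return True
    return False


soup = bs4.BeautifulSoup('<div data-id="42"><span>hi</span></div>', 'html.parser')
div, span = soup.find('div'), soup.find('span')
assert has_data_attribute(div) is True
assert has_data_attribute(span) is False
assert list(data_attributes_of_element(div)) == ['data-id']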

View File

@@ -30,7 +30,10 @@ def parse_duration(text: str) -> datetime.timedelta:
 def parse_response_datetime(response) -> datetime.datetime:
-    return datetime.datetime.strptime(response.headers['Date'], FORMAT_DATE_HEADER).replace(tzinfo=datetime.UTC)
+    return datetime.datetime.strptime(
+        response.headers['Date'], FORMAT_DATE_HEADER,
+    ).replace(tzinfo=datetime.UTC)
 
+
 def parse_time(text: str) -> datetime.datetime:
     text = text.replace('\n', ' ')
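
The reflowed parse_response_datetime is a single strptime call over the HTTP Date header; a standalone sketch, assuming FORMAT_DATE_HEADER is the usual RFC 7231 date format (its real value is defined elsewhere in parse_util) and Python 3.11+ for datetime.UTC. FakeResponse is a hypothetical stand-in for the response object:

import datetime

# Assumed value; the real FORMAT_DATE_HEADER lives elsewhere in the module.
FORMAT_DATE_HEADER = '%a, %d %b %Y %H:%M:%S GMT'


def parse_response_datetime(response) -> datetime.datetime:
    # Parse the HTTP Date header and pin the result to UTC.
    return datetime.datetime.strptime(
        response.headers['Date'], FORMAT_DATE_HEADER,
    ).replace(tzinfo=datetime.UTC)


class FakeResponse:
    # Stand-in carrying only the single header the function reads.
    headers = {'Date': 'Thu, 25 Jul 2024 11:06:05 GMT'}


print(parse_response_datetime(FakeResponse()))
# 2024-07-25 11:06:05+00:00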