parent 9dedb18c4f
commit 231036c14a
@@ -4,12 +4,8 @@ By request of my colleague.
 """
 
 import abc
-from decimal import Decimal
 import dataclasses
-import datetime
-import logging
-import re
-import secrets
+from decimal import Decimal
 
 import bs4
 
@@ -17,12 +13,14 @@ import personal_data.html_util
 import personal_data.parse_util
 from personal_data.data import DeduplicateMode, Scraper
 
+
 def parse_dkk_price(dkk: str) -> Decimal:
     print(dkk)
     if dkk.strip() == '-':
         return None
     return Decimal(dkk.removesuffix(' DKK').replace(',', '.'))
 
+
 @dataclasses.dataclass(frozen=True)
 class TavexScraperBase(Scraper):
     deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS
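For reference, a short sketch (outside the diff) of how parse_dkk_price behaves, assuming Tavex lists prices without thousands separators:

# Illustrative only: expected behaviour of parse_dkk_price as defined above.
from decimal import Decimal

assert parse_dkk_price('123,45 DKK') == Decimal('123.45')
assert parse_dkk_price(' - ') is None  # a dash marks a missing price
# Note: a price with a thousands separator, e.g. '4.750,00 DKK', would make
# Decimal() raise, since only the comma is rewritten to a dot.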
@@ -54,15 +52,24 @@ class TavexScraperBase(Scraper):
         for soup_row in soup.children:
             if isinstance(soup_row, bs4.NavigableString):
                 continue
-            table.append([soup_cell.get_text().strip() for soup_cell in soup_row if not isinstance(soup_cell, bs4.NavigableString)])
+            table.append(
+                [
+                    soup_cell.get_text().strip()
+                    for soup_cell in soup_row
+                    if not isinstance(soup_cell, bs4.NavigableString)
+                ],
+            )
 
         yield {
             'time': NOW,
             'buy': parse_dkk_price(table[1][1]),
             'sell': parse_dkk_price(table[1][2]),
-            'spread_percentage': Decimal(table[1][3].removesuffix('%')) if len(table[1]) > 3 else None
+            'spread_percentage': Decimal(table[1][3].removesuffix('%'))
+            if len(table[1]) > 3
+            else None,
         }
 
+
 @dataclasses.dataclass(frozen=True)
 class TavexScraperGold(TavexScraperBase):
     dataset_name = 'prices_tavex_gold'
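A standalone sketch of the same row/cell extraction, run against a made-up price table (the real markup on tavex.dk is not shown in this diff):

import bs4

html = '''
<table>
  <tr><th>Produkt</th><th>Køb</th><th>Sælg</th><th>Spread</th></tr>
  <tr><td>Guld</td><td>123,45 DKK</td><td>130,00 DKK</td><td>5%</td></tr>
</table>
'''
soup = bs4.BeautifulSoup(html, 'html.parser').table

table = []
for soup_row in soup.children:
    if isinstance(soup_row, bs4.NavigableString):  # whitespace between <tr> tags
        continue
    table.append(
        [
            soup_cell.get_text().strip()
            for soup_cell in soup_row
            if not isinstance(soup_cell, bs4.NavigableString)
        ],
    )

print(table[1])  # ['Guld', '123,45 DKK', '130,00 DKK', '5%']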
@@ -72,6 +79,7 @@ class TavexScraperGold(TavexScraperBase):
     def page_url() -> str:
         return 'https://tavex.dk/guld/1oz-canadisk-maple-leaf-guldmont/'
 
+
 @dataclasses.dataclass(frozen=True)
 class TavexScraperSilver(TavexScraperBase):
     dataset_name = 'prices_tavex_silver'
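The gold and silver scrapers differ only in dataset_name and page_url; the shared parsing logic sits on TavexScraperBase. A hypothetical further product page would follow the same shape (URL and dataset name below are invented, and page_url is assumed to be a @staticmethod, matching its missing self parameter):

@dataclasses.dataclass(frozen=True)
class TavexScraperPlatinum(TavexScraperBase):
    dataset_name = 'prices_tavex_platinum'  # invented for illustration

    @staticmethod
    def page_url() -> str:
        return 'https://tavex.dk/platin/'  # invented for illustration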
@@ -57,21 +57,22 @@ def normalize_soup(soup) -> bytes:
     text = normalize_soup_lxml(soup).text_content()
     return normalize_text(text)
 
+
 def data_attributes_of_element(e):
     for attr_key in list(e.attrs.keys()):
         if attr_key.startswith('data-'):
             yield attr_key
 
+
 def has_data_attribute(e) -> bool:
     for attr_key in data_attributes_of_element(e):
         return True
     return False
 
-def normalize_soup_slightly(soup,
-                            classes=True,
-                            scripts=True,
-                            comments=True,
-                            data_attributes=True):
+
+def normalize_soup_slightly(
+    soup, classes=True, scripts=True, comments=True, data_attributes=True,
+):
     """Perform soup normalization."""
     # Little if any content
     for tag in HTML_TAGS_MOSTLY_CONTENTLESS:
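A standalone sketch of the two data-attribute helpers above, applied to a made-up tag:

import bs4

tag = bs4.BeautifulSoup('<div data-id="7" class="row"></div>', 'html.parser').div
print(list(data_attributes_of_element(tag)))  # ['data-id']
print(has_data_attribute(tag))                # True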
@@ -30,7 +30,10 @@ def parse_duration(text: str) -> datetime.timedelta:
 
 
 def parse_response_datetime(response) -> datetime.datetime:
-    return datetime.datetime.strptime(response.headers['Date'], FORMAT_DATE_HEADER).replace(tzinfo=datetime.UTC)
+    return datetime.datetime.strptime(
+        response.headers['Date'], FORMAT_DATE_HEADER,
+    ).replace(tzinfo=datetime.UTC)
 
+
 def parse_time(text: str) -> datetime.datetime:
     text = text.replace('\n', ' ')
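An equivalent standalone call, assuming FORMAT_DATE_HEADER (defined elsewhere in parse_util, not shown here) is the usual RFC 7231 Date-header format; datetime.UTC requires Python 3.11+:

import datetime

FORMAT_DATE_HEADER = '%a, %d %b %Y %H:%M:%S %Z'  # assumption, not part of the diff
parsed = datetime.datetime.strptime(
    'Mon, 25 Mar 2024 12:00:00 GMT', FORMAT_DATE_HEADER,
).replace(tzinfo=datetime.UTC)
print(parsed)  # 2024-03-25 12:00:00+00:00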