1
0
personal-data/personal_data/fetchers/tavex.py

101 lines
2.7 KiB
Python
Raw Normal View History

2024-07-25 11:05:50 +00:00
"""Scrapes Tavex.dk prices.
By request of my colleague.
"""
import abc
import dataclasses
2024-07-25 11:06:05 +00:00
from decimal import Decimal
2024-07-25 11:05:50 +00:00
import bs4
2024-10-03 21:24:12 +00:00
import requests_util
2024-07-25 11:05:50 +00:00
import personal_data.html_util
import personal_data.parse_util
from personal_data.data import DeduplicateMode, Scraper
2024-08-25 19:18:55 +00:00
# Root URL of the Tavex website (note the trailing slash). It is passed to
# requests_util.setup_limiter and used as the prefix of the product page URLs.
URL_API_ROOT = 'https://tavex.dk/'
2024-07-25 11:06:05 +00:00
2024-10-03 21:24:12 +00:00
2024-07-25 11:05:50 +00:00
def parse_dkk_price(dkk: str) -> 'Decimal | None':
    """Parse a DKK price string with a decimal comma into a ``Decimal``.

    Args:
        dkk: Price text such as ``'123,45 DKK'``. Surrounding whitespace
            (including non-breaking spaces) is ignored. A lone ``'-'`` is
            treated as "no price available".

    Returns:
        The parsed amount, or ``None`` when the text is the ``'-'``
        placeholder.

    Raises:
        decimal.InvalidOperation: If the text left after removing the
            currency suffix is not a valid number.
    """
    text = dkk.strip()
    if text == '-':
        return None
    # Drop the currency suffix and whatever whitespace separated it from the
    # number, then convert the decimal comma to a decimal point.
    number = text.removesuffix('DKK').rstrip().replace(',', '.')
    return Decimal(number)
2024-07-25 11:05:50 +00:00
@dataclasses.dataclass(frozen=True)
class TavexScraperBase(Scraper):
    """Shared scraping logic for a single Tavex product page.

    Subclasses provide the product URL through ``page_url``; ``scrape`` then
    downloads that page and yields one row with the quoted prices.
    """

    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    @staticmethod
    def requires_cfscrape() -> bool:
        """Always return ``True``.

        NOTE(review): presumably tells the scraping framework to use a
        cfscrape-backed session -- confirm against ``Scraper``.
        """
        return True

    @staticmethod
    @abc.abstractmethod
    def page_url() -> str:
        """Return the URL of the product page to scrape."""

    def _setup_cache(self):
        """Set up a limiter (5 per minute) on the session for the Tavex root URL."""
        requests_util.setup_limiter(
            self.session,
            URL_API_ROOT,
            per_minute=5,
        )

    def scrape(self):
        """Fetch the product page and yield a single price row.

        Yields:
            A dict with the keys ``time`` (taken from the HTTP response),
            ``buy``, ``sell`` and ``spread_percentage``. The prices are
            ``Decimal`` values or ``None``; ``spread_percentage`` is ``None``
            when the price row has no fourth cell.

        Raises:
            ValueError: If the price table is missing from the page or its
                second row has fewer than three cells.
        """
        self._setup_cache()

        response = self.session.get(self.page_url())
        response.raise_for_status()

        now = personal_data.parse_util.parse_response_datetime(response)

        soup = bs4.BeautifulSoup(response.content, 'lxml')
        # The return value was never used; the call is kept in case it
        # normalizes `soup` in place -- TODO confirm against html_util.
        personal_data.html_util.normalize_soup_slightly(
            soup,
            classes=False,
            scripts=True,
        )

        soup_table = soup.select_one('.product-poster__box .product-poster__table')
        if soup_table is None:
            msg = f'Price table not found on {self.page_url()}'
            raise ValueError(msg)

        # Collect the text of every element cell, skipping the bare text
        # nodes that sit between tags.
        table = [
            [
                soup_cell.get_text().strip()
                for soup_cell in soup_row
                if not isinstance(soup_cell, bs4.NavigableString)
            ]
            for soup_row in soup_table.children
            if not isinstance(soup_row, bs4.NavigableString)
        ]

        # Prices are read from the second row; the first row is presumably
        # the header -- TODO confirm against the live page.
        if len(table) < 2 or len(table[1]) < 3:
            msg = f'Unexpected price table layout on {self.page_url()}: {table!r}'
            raise ValueError(msg)
        price_row = table[1]

        yield {
            'time': now,
            'buy': parse_dkk_price(price_row[1]),
            'sell': parse_dkk_price(price_row[2]),
            'spread_percentage': Decimal(price_row[3].removesuffix('%'))
            if len(price_row) > 3
            else None,
        }
2024-07-25 11:06:05 +00:00
2024-07-25 11:05:50 +00:00
@dataclasses.dataclass(frozen=True)
class TavexScraperGold(TavexScraperBase):
    """Scraper for the 1 oz Canadian Maple Leaf gold coin page on Tavex."""

    dataset_name = 'prices_tavex/guld-1oz-canadisk-maple-leaf-guldmont'
    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    @staticmethod
    def page_url() -> str:
        """Return the URL of the gold coin product page."""
        # Trim any trailing '/' from the root so the joined URL does not
        # contain a double slash ('https://tavex.dk//guld/...').
        return f'{URL_API_ROOT.rstrip("/")}/guld/1oz-canadisk-maple-leaf-guldmont/'
2024-07-25 11:05:50 +00:00
2024-07-25 11:06:05 +00:00
2024-07-25 11:05:50 +00:00
@dataclasses.dataclass(frozen=True)
class TavexScraperSilver(TavexScraperBase):
    """Scraper for the 1 oz American Eagle silver coin (earlier years) page on Tavex."""

    dataset_name = 'prices_tavex/solv-1-oz-american-eagle-solvmont-tidligere-argange'
    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    @staticmethod
    def page_url() -> str:
        """Return the URL of the silver coin product page."""
        # Trim any trailing '/' from the root so the joined URL does not
        # contain a double slash ('https://tavex.dk//solv/...').
        return f'{URL_API_ROOT.rstrip("/")}/solv/1-oz-american-eagle-solvmont-tidligere-argange/'