"""Scrapes Tavex.dk prices.

By request of my colleague.
"""

import abc
import dataclasses
import datetime
import logging
import re
import secrets
from decimal import Decimal

import bs4

import personal_data.html_util
import personal_data.parse_util
from personal_data.data import DeduplicateMode, Scraper

logger = logging.getLogger(__name__)


def parse_dkk_price(dkk: str) -> Decimal | None:
    """Parse a Danish-formatted price cell such as ``'1.234,56 DKK'``.

    Returns ``None`` when the cell contains only a dash (no price quoted).
    """
    dkk = dkk.strip()
    if dkk == '-':
        return None
    # Danish number format: '.' is the thousands separator and ',' the
    # decimal point.  Strip the former before swapping the latter, otherwise
    # prices above 999 DKK (e.g. '17.000,00 DKK') fail to parse.
    return Decimal(dkk.removesuffix(' DKK').replace('.', '').replace(',', '.'))


@dataclasses.dataclass(frozen=True)
class TavexScraperBase(Scraper):
    """Base scraper for Tavex.dk precious-metal product pages."""

    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    @staticmethod
    def requires_cfscrape() -> bool:
        # Tavex sits behind Cloudflare, so a plain HTTP session is blocked.
        return True

    @staticmethod
    @abc.abstractmethod
    def page_url() -> str:
        """Return the URL of the product page to scrape."""

    def scrape(self):
        """Yield a single price sample (buy/sell/spread) from the page."""
        response = self.session.get(self.page_url())
        response.raise_for_status()

        now = personal_data.parse_util.parse_response_datetime(response)

        soup = bs4.BeautifulSoup(response.content, 'lxml')
        # Normalization mutates the soup in place; classes are kept because
        # the CSS selector below depends on them.
        personal_data.html_util.normalize_soup_slightly(
            soup,
            classes=False,
            scripts=True,
        )

        table_soup = soup.select_one('.product-poster__box .product-poster__table')
        if table_soup is None:
            msg = 'Could not find price table on Tavex page'
            raise ValueError(msg)

        # Flatten the table into rows of stripped cell texts, skipping
        # whitespace-only text nodes between tags.
        table = [
            [
                cell.get_text().strip()
                for cell in row
                if not isinstance(cell, bs4.NavigableString)
            ]
            for row in table_soup.children
            if not isinstance(row, bs4.NavigableString)
        ]

        # Row 0 is the header; row 1 holds the quoted prices.  The spread
        # column is absent for some products, hence the length guard.
        yield {
            'time': now,
            'buy': parse_dkk_price(table[1][1]),
            'sell': parse_dkk_price(table[1][2]),
            'spread_percentage': Decimal(table[1][3].removesuffix('%'))
            if len(table[1]) > 3
            else None,
        }


@dataclasses.dataclass(frozen=True)
class TavexScraperGold(TavexScraperBase):
    """Price scraper for the 1 oz Canadian Maple Leaf gold coin."""

    dataset_name = 'prices_tavex_gold'
    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    @staticmethod
    def page_url() -> str:
        return 'https://tavex.dk/guld/1oz-canadisk-maple-leaf-guldmont/'


@dataclasses.dataclass(frozen=True)
class TavexScraperSilver(TavexScraperBase):
    """Price scraper for the 1 oz American Eagle silver coin."""

    dataset_name = 'prices_tavex_silver'
    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    @staticmethod
    def page_url() -> str:
        return 'https://tavex.dk/solv/1-oz-american-eagle-solvmont-tidligere-argange/'
def parse_response_datetime(response) -> datetime.datetime:
    """Parse the HTTP ``Date`` response header into an aware UTC datetime.

    Uses the stdlib RFC 2822 parser instead of ``strptime``: it is
    locale-independent (``%a``/``%b`` patterns are locale-sensitive) and
    already returns a timezone-aware value, avoiding the 3.11-only
    ``datetime.UTC`` alias.
    """
    import email.utils  # local import: module-level import block left untouched

    return email.utils.parsedate_to_datetime(response.headers['Date'])