Jon Michael Aanes 2024-07-25 13:05:50 +02:00
parent b8c0c5c52c
commit 9dedb18c4f
3 changed files with 104 additions and 3 deletions


@@ -0,0 +1,82 @@
"""Scrapes Tavex.dk prices.

By request of my colleague.
"""

import abc
from decimal import Decimal
import dataclasses
import datetime
import logging
import re
import secrets

import bs4

import personal_data.html_util
import personal_data.parse_util
from personal_data.data import DeduplicateMode, Scraper


def parse_dkk_price(dkk: str) -> Decimal | None:
    """Parses a price cell such as '225,50 DKK'; a lone '-' means no price."""
    if dkk.strip() == '-':
        return None
    return Decimal(dkk.removesuffix(' DKK').replace(',', '.'))


@dataclasses.dataclass(frozen=True)
class TavexScraperBase(Scraper):
    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    @staticmethod
    def requires_cfscrape() -> bool:
        return True

    @staticmethod
    @abc.abstractmethod
    def page_url() -> str:
        pass

    def scrape(self):
        response = self.session.get(self.page_url())
        response.raise_for_status()

        NOW = personal_data.parse_util.parse_response_datetime(response)

        soup = bs4.BeautifulSoup(response.content, 'lxml')
        # normalize_soup_slightly modifies the soup in place.
        soup_page = personal_data.html_util.normalize_soup_slightly(
            soup,
            classes=False,
            scripts=True,
        )

        # The price table sits inside the product poster box.
        soup = soup.select_one('.product-poster__box .product-poster__table')
        table = []
        for soup_row in soup.children:
            if isinstance(soup_row, bs4.NavigableString):
                continue
            table.append(
                [
                    soup_cell.get_text().strip()
                    for soup_cell in soup_row
                    if not isinstance(soup_cell, bs4.NavigableString)
                ],
            )

        # The second row holds the values used for buy/sell and the spread.
        yield {
            'time': NOW,
            'buy': parse_dkk_price(table[1][1]),
            'sell': parse_dkk_price(table[1][2]),
            'spread_percentage': Decimal(table[1][3].removesuffix('%'))
            if len(table[1]) > 3
            else None,
        }


@dataclasses.dataclass(frozen=True)
class TavexScraperGold(TavexScraperBase):
    dataset_name = 'prices_tavex_gold'
    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    @staticmethod
    def page_url() -> str:
        return 'https://tavex.dk/guld/1oz-canadisk-maple-leaf-guldmont/'


@dataclasses.dataclass(frozen=True)
class TavexScraperSilver(TavexScraperBase):
    dataset_name = 'prices_tavex_silver'
    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    @staticmethod
    def page_url() -> str:
        return 'https://tavex.dk/solv/1-oz-american-eagle-solvmont-tidligere-argange/'
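
As a quick sanity check on the cell parsing, here is a hedged sketch of what parse_dkk_price accepts. The '-' placeholder and the ' DKK' suffix handling come from the code above; the comma-as-decimal-separator example value is an assumption about how Tavex formats its cells, not something verified against the site.

from decimal import Decimal

assert parse_dkk_price('-') is None                          # empty cell placeholder
assert parse_dkk_price('225,50 DKK') == Decimal('225.50')    # assumed cell format
# Note: a thousands separator (e.g. '14.500,00 DKK') would not survive the
# single replace(',', '.') call; whether Tavex renders one is not confirmed here.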


@@ -57,8 +57,22 @@ def normalize_soup(soup) -> bytes:
    text = normalize_soup_lxml(soup).text_content()
    return normalize_text(text)


def data_attributes_of_element(e):
    """Yields the names of all data-* attributes on the given element."""
    for attr_key in list(e.attrs.keys()):
        if attr_key.startswith('data-'):
            yield attr_key


def normalize_soup_slightly(soup, classes=True, scripts=True, comments=True):
def has_data_attribute(e) -> bool:
    """Returns True if the element carries at least one data-* attribute."""
    for attr_key in data_attributes_of_element(e):
        return True
    return False


def normalize_soup_slightly(soup,
                            classes=True,
                            scripts=True,
                            comments=True,
                            data_attributes=True):
    """Perform soup normalization."""
    # Little if any content
    for tag in HTML_TAGS_MOSTLY_CONTENTLESS:
        for e in soup.select(tag):
@@ -85,5 +99,11 @@ def normalize_soup_slightly(soup, classes=True, scripts=True, comments=True):
    for c in soup.find_all(string=lambda text: isinstance(text, bs4.Comment)):
        c.extract()

    if data_attributes:
        # Strip every data-* attribute from each element that carries one.
        for e in soup.find_all(has_data_attribute):
            for attr_key in data_attributes_of_element(e):
                del e[attr_key], attr_key
            del e

    soup.smooth()
    return soup
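
A minimal sketch of the two new helpers, using a throwaway fragment rather than anything from the repository; the HTML below is purely illustrative.

import bs4

div = bs4.BeautifulSoup('<div data-price="225,50" class="row">spot</div>', 'lxml').div
assert has_data_attribute(div)
assert list(data_attributes_of_element(div)) == ['data-price']
# With data_attributes=True, normalize_soup_slightly deletes exactly these
# data-* attributes from each matching element; other attributes are left
# untouched by this particular pass.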


@@ -30,8 +30,7 @@ def parse_duration(text: str) -> datetime.timedelta:
def parse_response_datetime(response) -> datetime.datetime:
    return datetime.datetime.strptime(response.headers['Date'], FORMAT_DATE_HEADER)
    return datetime.datetime.strptime(response.headers['Date'], FORMAT_DATE_HEADER).replace(tzinfo=datetime.UTC)


def parse_time(text: str) -> datetime.datetime:
    text = text.replace('\n', ' ')
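
The parse_response_datetime change only makes the result timezone-aware: HTTP Date headers are defined in GMT, and datetime.UTC (a Python 3.11+ alias for datetime.timezone.utc) attaches that fact without shifting the wall-clock value. A small sketch, with a made-up header value and an assumed format string, since FORMAT_DATE_HEADER is not shown in this diff:

import datetime

raw = 'Thu, 25 Jul 2024 11:05:50 GMT'          # hypothetical header value
fmt = '%a, %d %b %Y %H:%M:%S GMT'              # assumed FORMAT_DATE_HEADER
naive = datetime.datetime.strptime(raw, fmt)   # previous behaviour: no tzinfo
aware = naive.replace(tzinfo=datetime.UTC)     # new behaviour: explicit UTC
assert aware.utcoffset() == datetime.timedelta(0)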