Tavex
This commit is contained in:
parent
b8c0c5c52c
commit
9dedb18c4f
82
personal_data/fetchers/tavex.py
Normal file
82
personal_data/fetchers/tavex.py
Normal file
|
@ -0,0 +1,82 @@
|
|||
"""Scrapes Tavex.dk prices.
|
||||
|
||||
By request of my colleague.
|
||||
"""
|
||||
|
||||
import abc
|
||||
from decimal import Decimal
|
||||
import dataclasses
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
import secrets
|
||||
|
||||
import bs4
|
||||
|
||||
import personal_data.html_util
|
||||
import personal_data.parse_util
|
||||
from personal_data.data import DeduplicateMode, Scraper
|
||||
|
||||
def parse_dkk_price(dkk: str) -> Decimal:
|
||||
print(dkk)
|
||||
if dkk.strip() == '-':
|
||||
return None
|
||||
return Decimal(dkk.removesuffix(' DKK').replace(',','.'))
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
class TavexScraperBase(Scraper):
    """Base scraper for a single Tavex.dk bullion product page.

    Subclasses pin the concrete product by overriding `page_url`.
    """

    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    @staticmethod
    def requires_cfscrape() -> bool:
        # Tavex.dk sits behind an anti-bot layer, so a cfscrape-capable
        # session is required.
        return True

    @staticmethod
    @abc.abstractmethod
    def page_url() -> str:
        """Return the URL of the product page to scrape.

        Declared as a staticmethod to match the subclass implementations;
        the original abstract declaration took no ``self`` either.
        """

    def scrape(self):
        """Yield one row of current buy/sell prices for the product."""
        response = self.session.get(self.page_url())
        response.raise_for_status()

        now = personal_data.parse_util.parse_response_datetime(response)

        soup = bs4.BeautifulSoup(response.content, 'lxml')
        # normalize_soup_slightly mutates the soup in place; no separate
        # variable is needed for its return value.
        personal_data.html_util.normalize_soup_slightly(
            soup,
            classes=False,
            scripts=True,
        )

        soup = soup.select_one('.product-poster__box .product-poster__table')

        # Collect the visible table as a list of rows of stripped cell text,
        # skipping bare whitespace/text nodes between tags.
        table = []
        for soup_row in soup.children:
            if isinstance(soup_row, bs4.NavigableString):
                continue
            table.append(
                [
                    soup_cell.get_text().strip()
                    for soup_cell in soup_row
                    if not isinstance(soup_cell, bs4.NavigableString)
                ],
            )

        # Row 1 holds the product's prices: [name, buy, sell, (spread%)].
        yield {
            'time': now,
            'buy': parse_dkk_price(table[1][1]),
            'sell': parse_dkk_price(table[1][2]),
            'spread_percentage': Decimal(table[1][3].removesuffix('%'))
            if len(table[1]) > 3
            else None,
        }
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
class TavexScraperGold(TavexScraperBase):
    """Scrapes the Tavex.dk price of a 1 oz Canadian Maple Leaf gold coin."""

    # Destination dataset for the scraped price rows.
    dataset_name = 'prices_tavex_gold'
    # Two rows are duplicates only if every column matches.
    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    @staticmethod
    def page_url() -> str:
        """Fixed product page scraped by this dataset."""
        return 'https://tavex.dk/guld/1oz-canadisk-maple-leaf-guldmont/'
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
class TavexScraperSilver(TavexScraperBase):
    """Scrapes the Tavex.dk price of a 1 oz American Eagle silver coin."""

    # Destination dataset for the scraped price rows.
    dataset_name = 'prices_tavex_silver'
    # Two rows are duplicates only if every column matches.
    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    @staticmethod
    def page_url() -> str:
        """Fixed product page scraped by this dataset."""
        return 'https://tavex.dk/solv/1-oz-american-eagle-solvmont-tidligere-argange/'
|
|
@ -57,8 +57,22 @@ def normalize_soup(soup) -> bytes:
|
|||
text = normalize_soup_lxml(soup).text_content()
|
||||
return normalize_text(text)
|
||||
|
||||
def data_attributes_of_element(e):
    """Yield the names of all ``data-*`` attributes on element *e*.

    The attribute names are snapshotted into a list first, so callers
    may safely delete attributes from *e* while consuming the generator.
    """
    attribute_names = list(e.attrs.keys())
    for name in attribute_names:
        if name.startswith('data-'):
            yield name
|
||||
|
||||
def normalize_soup_slightly(soup, classes=True, scripts=True, comments=True):
|
||||
def has_data_attribute(e) -> bool:
|
||||
for attr_key in data_attributes_of_element(e):
|
||||
return True
|
||||
return False
|
||||
|
||||
def normalize_soup_slightly(soup,
|
||||
classes=True,
|
||||
scripts=True,
|
||||
comments=True,
|
||||
data_attributes=True):
|
||||
"""Perform soup normalization."""
|
||||
# Little if any content
|
||||
for tag in HTML_TAGS_MOSTLY_CONTENTLESS:
|
||||
for e in soup.select(tag):
|
||||
|
@ -85,5 +99,11 @@ def normalize_soup_slightly(soup, classes=True, scripts=True, comments=True):
|
|||
for c in soup.find_all(string=lambda text: isinstance(text, bs4.Comment)):
|
||||
c.extract()
|
||||
|
||||
if data_attributes:
|
||||
for e in soup.find_all(has_data_attribute):
|
||||
for attr_key in data_attributes_of_element(e):
|
||||
del e[attr_key], attr_key
|
||||
del e
|
||||
|
||||
soup.smooth()
|
||||
return soup
|
||||
|
|
|
@ -30,8 +30,7 @@ def parse_duration(text: str) -> datetime.timedelta:
|
|||
|
||||
|
||||
def parse_response_datetime(response) -> datetime.datetime:
    """Parse the HTTP ``Date`` header of *response* into an aware datetime."""
    header_value = response.headers['Date']
    parsed = datetime.datetime.strptime(header_value, FORMAT_DATE_HEADER)
    # strptime yields a naive value; HTTP dates are GMT, so mark it as UTC.
    return parsed.replace(tzinfo=datetime.UTC)
||||
def parse_time(text: str) -> datetime.datetime:
|
||||
text = text.replace('\n', ' ')
|
||||
|
|
Loading…
Reference in New Issue
Block a user