diff --git a/personal_data/fetchers/crunchyroll.py b/personal_data/fetchers/crunchyroll.py
index 8447285..3472283 100644
--- a/personal_data/fetchers/crunchyroll.py
+++ b/personal_data/fetchers/crunchyroll.py
@@ -1,6 +1,7 @@
 import dataclasses
 import logging
 import secrets
+from collections.abc import Iterator, Mapping
 
 from personal_data.data import DeduplicateMode, Scraper
 
@@ -19,7 +20,7 @@ class CrunchyrollScraper(Scraper):
     dataset_name = 'episodes_watched_crunchyroll'
     deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS
 
-    def scrape(self):
+    def scrape(self) -> Iterator[Mapping[str, object]]:
         headers = {
             'Referer': 'https://www.crunchyroll.com/history',
             'Authorization': secrets.CRUNCHYROLL_AUTH,  # TODO: Determine automatically
diff --git a/personal_data/fetchers/home_assistant.py b/personal_data/fetchers/home_assistant.py
index d52ddc4..b9e8b56 100644
--- a/personal_data/fetchers/home_assistant.py
+++ b/personal_data/fetchers/home_assistant.py
@@ -28,7 +28,6 @@ class HomeAssistantScaleWeight(Scraper):
         end_time = datetime.datetime.now()
         start_time = end_time - datetime.timedelta(days=90)
         url = f'{HA_ROOT}/api/history/period/{start_time}'
-        print(url)
         params = {
             'filter_entity_id': 'sensor.bathroom_scale_mass',
             'end_time': end_time,
diff --git a/personal_data/fetchers/myanimelist.py b/personal_data/fetchers/myanimelist.py
new file mode 100644
index 0000000..e2c7124
--- /dev/null
+++ b/personal_data/fetchers/myanimelist.py
@@ -0,0 +1,34 @@
+import bs4
+import urllib.parse
+import json
+import dataclasses
+import logging
+from collections.abc import Iterator, Mapping
+
+from personal_data.data import DeduplicateMode, Scraper
+
+logger = logging.getLogger(__name__)
+
+@dataclasses.dataclass(frozen=True)
+class MyAnimeList(Scraper):
+    dataset_name = 'myanimelist_anime'
+    deduplicate_mode = DeduplicateMode.BY_FIRST_COLUMN
+
+    def scrape(self) -> Iterator[Mapping[str, object]]:
+        username = 'WhereTheDogGoin'  # TODO: Make configurable
+        url = f'https://myanimelist.net/animelist/{username}'
+        response = self.session.get(url)
+        response.raise_for_status()
+
+        # The anime list is embedded as JSON in the data-items attribute.
+        soup = bs4.BeautifulSoup(response.text, 'html.parser')
+        data_items_soup = soup.select('[data-items]')[0]
+        data_items = json.loads(data_items_soup.get('data-items'))
+
+        for data_item in data_items:
+            yield {
+                'series.name': data_item.get('anime_title_eng') or data_item.get('anime_title'),
+                'series.myanimelist_url': urllib.parse.urljoin(url, data_item['anime_url']),
+                'series.icon': urllib.parse.urljoin(url, data_item['anime_image_path']),
+                'me.score': data_item.get('score'),
+            }
diff --git a/personal_data/util.py b/personal_data/util.py
index 4fa57f8..26631d5 100644
--- a/personal_data/util.py
+++ b/personal_data/util.py
@@ -114,6 +114,8 @@ def extend_csv_file(
     deduplicate_mode: data.DeduplicateMode,
     deduplicate_ignore_columns: list[str],
 ) -> dict:
+    if deduplicate_ignore_columns == data.Scraper.deduplicate_ignore_columns:
+        deduplicate_ignore_columns = []
     if not isinstance(deduplicate_ignore_columns, list):
         raise TypeError(deduplicate_ignore_columns)
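
Usage sketch for the new fetcher. This is a minimal, untested example: it assumes Scraper subclasses are constructed with a requests.Session (inferred from self.session in scrape()) and that extend_csv_file takes the CSV path and the row dicts as its leading arguments; neither detail is shown in this diff, and the output path is hypothetical.

    import requests

    from personal_data.fetchers.myanimelist import MyAnimeList
    from personal_data.util import extend_csv_file

    # Construct the scraper; the `session` field is assumed from the
    # `self.session.get(url)` call in scrape().
    scraper = MyAnimeList(session=requests.Session())

    # Each yielded row maps column names ('series.name', 'me.score', ...)
    # to values; BY_FIRST_COLUMN presumably deduplicates on 'series.name'.
    rows = list(scraper.scrape())

    extend_csv_file(
        'output/myanimelist_anime.csv',  # hypothetical path and argument order
        rows,
        deduplicate_mode=MyAnimeList.deduplicate_mode,
        deduplicate_ignore_columns=[],
    )

The new guard in extend_csv_file normalizes the class-level default of deduplicate_ignore_columns to an empty list, so callers that forward the Scraper attribute unchanged still pass the isinstance check below it.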