import dataclasses
import datetime
import logging
from collections.abc import Iterator, Mapping

import requests_util

from personal_data.data import DeduplicateMode, Scraper

from .. import secrets

logger = logging.getLogger(__name__)

URL_API_ROOT = 'https://api.wanikani.com/v2'
URL_ASSIGNMENTS = URL_API_ROOT + '/assignments'
URL_SUBJECTS = URL_API_ROOT + '/subjects/{subject_id}'


@dataclasses.dataclass(frozen=True)
class WaniKaniLessonsFetcher(Scraper):
    """Scraper for WaniKani lesson assignments.

    Pages through the WaniKani `/assignments` endpoint and yields one dict
    per assignment, augmented with the subject's characters fetched from
    the `/subjects/{id}` endpoint.
    """

    dataset_name = 'wanikani_lessons'
    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    def _setup_cache(self) -> None:
        """Install rate limiting and response caching on the session.

        Both limiters allow 30 requests per minute; the cache TTLs differ
        because the underlying data changes at different rates.
        """
        # Subject data (readings, characters, ...) is essentially static,
        # so cache responses for a long time.
        requests_util.setup_limiter(
            self.session,
            URL_API_ROOT,
            expire_after=datetime.timedelta(days=90),
            per_minute=30,
        )
        # Assignments change as the user studies, so keep this cache short.
        requests_util.setup_limiter(
            self.session,
            URL_ASSIGNMENTS,
            expire_after=datetime.timedelta(days=3),
            per_minute=30,
        )

    def scrape(self) -> Iterator[Mapping[str, object]]:
        """Fetch assignments from the WaniKani API and yield a dict for
        each assignment, with `subject_characters` filled in from the
        subject endpoint when a subject id is present.

        Raises:
            requests.HTTPError: if any API request fails.
        """
        self._setup_cache()
        headers = {
            'Authorization': f'Bearer {secrets.wanikani_api_key()}',
            'Wanikani-Revision': '20170710',
        }
        url = URL_ASSIGNMENTS
        while url:
            # INFO, not WARNING: this is routine progress logging, not a
            # problem report.
            logger.info('Getting: %s', url)
            response = self.session.get(url, headers=headers)
            response.raise_for_status()
            json_resp = response.json()
            for assignment in json_resp.get('data', []):
                data_item = assignment['data']
                subject_id = data_item.get('subject_id')
                # Explicit None check: a falsy-but-valid id (e.g. 0) must
                # still trigger the subject lookup.
                if subject_id is not None:
                    subj_url = URL_SUBJECTS.format(subject_id=subject_id)
                    logger.info('Getting: %s', subj_url)
                    subj_response = self.session.get(subj_url, headers=headers)
                    subj_response.raise_for_status()
                    subj_json = subj_response.json()
                    subject_characters = subj_json.get('data', {}).get(
                        'characters',
                    )
                    data_item['subject_characters'] = subject_characters
                yield data_item
            # Follow API pagination; `next_url` is None on the last page,
            # which ends the loop.
            url = json_resp.get('pages', {}).get('next_url')