"""Scraper that exports the Crunchyroll watch history as flat records."""

import dataclasses
import logging
import secrets

from personal_data.data import DeduplicateMode, Scraper

logger = logging.getLogger(__name__)

# NOTE(review): `secrets` must resolve to a module exposing CRUNCHYROLL_AUTH and
# CRUNCHYROLL_DEVICE_ID; the standard-library `secrets` module defines neither,
# so presumably a project-local module shadows it -- confirm the import path.

API_ROOT = 'https://www.crunchyroll.com'
API_URL_TOKEN = API_ROOT + '/auth/v1/token'
API_URL_ME = API_ROOT + '/accounts/v1/me'
API_URL_WATCH_LIST = (
    API_ROOT + '/content/v2/{account_uuid}/watch-history?page_size=100&locale=en-US'
)


@dataclasses.dataclass(frozen=True)
class CrunchyrollScraper(Scraper):
    """Scraper producing one record per entry of the Crunchyroll watch history."""

    # Plain class attributes (deliberately un-annotated, so they are not
    # dataclass fields).
    dataset_name = 'episodes_watched_crunchyroll'
    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    def scrape(self):
        """Yield one dict per watch-history entry returned by the API.

        Two requests are made through ``self.session``: a POST to the token
        endpoint, whose response supplies the bearer credentials and the
        account id, followed by a single GET of the watch history. Only that
        one response is parsed; no further pages are requested.

        Raises whatever ``raise_for_status()`` raises for a failed request,
        and ``KeyError`` if a response lacks one of the expected keys.
        """
        request_headers = {
            'Referer': 'https://www.crunchyroll.com/history',
            'Authorization': secrets.CRUNCHYROLL_AUTH,  # TODO: Determine automatically
        }

        # Step 1: obtain an access token (the response also carries the
        # account id needed to build the watch-history URL).
        logger.info('Getting Access Token')
        token_response = self.session.post(
            API_URL_TOKEN,
            headers=request_headers,
            cookies=self.session.cookies,
            data={
                'device_id': secrets.CRUNCHYROLL_DEVICE_ID,  # TODO: Determine automatically
                'device_type': 'Firefox on Linux',
                'grant_type': 'etp_rt_cookie',
            },
        )
        token_response.raise_for_status()
        token_data = token_response.json()

        # Swap the bootstrap credential for the freshly issued token.
        token_type = token_data['token_type']
        access_token = token_data['access_token']
        request_headers['Authorization'] = f'{token_type} {access_token}'

        account_uuid = token_data['account_id']
        logger.info(' Account UUID: %s', account_uuid)

        # Step 2: fetch the watch history for that account.
        logger.info('Getting Watchlist')
        history_url = API_URL_WATCH_LIST.format(account_uuid=account_uuid)
        history_response = self.session.get(history_url, headers=request_headers)
        history_response.raise_for_status()

        # Step 3: flatten every history entry into a single-level record.
        entries = history_response.json()['data']
        logger.info(' Watchlist length: %d', len(entries))

        for entry in entries:
            # Lookups are performed in the same order as the record's keys so
            # that a malformed entry fails on the same key as before.
            played_at = entry['date_played']
            panel = entry['panel']
            metadata = panel['episode_metadata']
            yield {
                # Sorting fields
                'datetime_played': played_at,
                # Important fields
                'series.title': metadata['series_title'],
                'season.number': metadata['season_number'],
                'episode.number': metadata['episode'],
                'episode.name': panel['title'],
                # Secondary fields
                'episode.language': metadata['audio_locale'],
                'episode.duration_ms': metadata['duration_ms'],
                'episode.maturity_ratings': ' '.join(metadata['maturity_ratings']),
                'season.title': metadata['season_title'],
                'fully_watched': entry['fully_watched'],
                # Identifiers
                'episode.crunchyroll_id': entry['id'],
                'series.crunchyroll_id': entry['parent_id'],
            }