1
0
personal-data/personal_data/fetchers/crunchyroll.py

94 lines
3.3 KiB
Python
Raw Normal View History

2024-03-03 15:59:03 +00:00
import dataclasses
2024-03-31 22:55:55 +00:00
import logging
import secrets
2024-01-28 21:29:29 +00:00
2024-03-31 22:55:55 +00:00
from personal_data.data import DeduplicateMode, Scraper
2024-01-28 21:33:30 +00:00
2024-01-28 21:29:29 +00:00
logger = logging.getLogger(__name__)

# Every endpoint below is built on top of the same site root.
API_ROOT = 'https://www.crunchyroll.com'
API_URL_TOKEN = f'{API_ROOT}/auth/v1/token'
API_URL_ME = f'{API_ROOT}/accounts/v1/me'
# `{account_uuid}` is intentionally left as a literal placeholder (it lives in
# the non-f-string half) so callers can fill it in later with `str.format`.
API_URL_WATCH_LIST = (
    f'{API_ROOT}/content/v2/'
    '{account_uuid}/watch-history?page_size=100&locale=en-US'
)
2024-01-28 21:29:29 +00:00
2024-03-31 22:55:55 +00:00
@dataclasses.dataclass(frozen=True)
class CrunchyrollScraper(Scraper):
    """Scraper that yields one row per entry in the Crunchyroll watch history."""

    dataset_name = 'episodes_watched_crunchyroll'
    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    def scrape(self):
        """Fetch the watch history and yield a flat dict for every entry.

        Two HTTP requests are made through ``self.session``: a POST to the
        token endpoint (which returns the access token and the account id),
        followed by a single GET of the watch-history endpoint for that
        account. Either request raises via ``raise_for_status`` on an HTTP
        error status.

        NOTE(review): ``secrets`` must resolve to a module exposing
        ``CRUNCHYROLL_AUTH`` and ``CRUNCHYROLL_DEVICE_ID`` -- confirm that the
        file-level import picks up the intended module.
        """
        request_headers = {
            'Referer': 'https://www.crunchyroll.com/history',
            'Authorization': secrets.CRUNCHYROLL_AUTH,  # TODO: Determine automatically
        }

        # Step 1: obtain an access token; the same response carries the
        # account id needed to address the watch-history endpoint.
        logger.info('Getting Access Token')
        token_form = {
            'device_id': secrets.CRUNCHYROLL_DEVICE_ID,  # TODO: Determine automatically
            'device_type': 'Firefox on Linux',
            'grant_type': 'etp_rt_cookie',
        }
        token_response = self.session.post(
            API_URL_TOKEN,
            headers=request_headers,
            cookies=self.session.cookies,
            data=token_form,
        )
        token_response.raise_for_status()
        token_payload = token_response.json()

        # Swap the initial credential for the freshly issued token.
        token_type = token_payload['token_type']
        access_token = token_payload['access_token']
        request_headers['Authorization'] = f'{token_type} {access_token}'
        account_uuid = token_payload['account_id']

        logger.info(' Account UUID: %s', account_uuid)

        # Step 2: download the watch history for that account.
        logger.info('Getting Watchlist')
        history_url = API_URL_WATCH_LIST.format(account_uuid=account_uuid)
        history_response = self.session.get(history_url, headers=request_headers)
        history_response.raise_for_status()

        # Step 3: flatten every history entry into a single-level dict.
        history_entries = history_response.json()['data']
        logger.info(' Watchlist length: %d', len(history_entries))

        for entry in history_entries:
            # 'date_played' is read before the nested lookups so that a
            # malformed entry fails on the same key as before the hoisting.
            played_at = entry['date_played']
            panel = entry['panel']
            metadata = panel['episode_metadata']
            yield {
                # Sorting fields
                'datetime_played': played_at,
                # Important fields
                'series.title': metadata['series_title'],
                'season.number': metadata['season_number'],
                'episode.number': metadata['episode'],
                'episode.name': panel['title'],
                # Secondary fields
                'episode.language': metadata['audio_locale'],
                'episode.duration_ms': metadata['duration_ms'],
                'episode.maturity_ratings': ' '.join(metadata['maturity_ratings']),
                'season.title': metadata['season_title'],
                'fully_watched': entry['fully_watched'],
                # Identifiers
                'episode.crunchyroll_id': entry['id'],
                'series.crunchyroll_id': entry['parent_id'],
            }