import dataclasses
import json
import logging
import re
import urllib.parse
from collections.abc import Iterator

import bs4

from personal_data.data import DeduplicateMode, Scraper

logger = logging.getLogger(__name__)


@dataclasses.dataclass(frozen=True)
class MyAnimeListAnime:
    """A single anime entry from a user's MyAnimeList list."""

    series_name_eng: str
    series_name: str
    series_myanimelist_url: urllib.parse.ParseResult
    series_icon: urllib.parse.ParseResult
    me_score: int


@dataclasses.dataclass(frozen=True)
class MyAnimeListSong:
    """An opening/ending theme song attached to an anime series."""

    song_name_eng: str
    song_name_jp: str | None
    song_artist: str
    song_placement: str
    series_name_eng: str
    series_name: str


@dataclasses.dataclass(frozen=True)
class MyAnimeList(Scraper):
    """Scrapes the anime list of a (hard-coded) MyAnimeList user."""

    dataset_name = 'myanimelist_anime'
    deduplicate_mode = DeduplicateMode.BY_FIRST_COLUMN

    def scrape(self) -> Iterator[MyAnimeListAnime]:
        """Yield one :class:`MyAnimeListAnime` per entry on the user's list.

        The list page embeds its entries as JSON in a ``data-items``
        attribute, so no per-row HTML parsing is needed.

        Raises:
            requests.HTTPError: if the list page cannot be fetched.
        """
        username = 'WhereTheDogGoin'
        url = f'https://myanimelist.net/animelist/{username}'
        response = self.session.get(url)
        response.raise_for_status()

        soup = bs4.BeautifulSoup(response.text)
        data_items_soup = soup.select('[data-items]')[0]
        data_items = json.loads(data_items_soup.get('data-items'))

        for data_item in data_items:
            # Either title variant may be missing; fall back to the other.
            yield MyAnimeListAnime(
                series_name_eng=data_item.get('anime_title_eng')
                or data_item.get('anime_title'),
                series_name=data_item.get('anime_title')
                or data_item.get('anime_title_eng'),
                series_myanimelist_url=urllib.parse.urlparse(
                    urllib.parse.urljoin(url, data_item['anime_url']),
                ),
                series_icon=urllib.parse.urlparse(
                    urllib.parse.urljoin(url, data_item['anime_image_path']),
                ),
                me_score=data_item.get('score'),
            )
        # NOTE: the original `del data_item` here raised NameError whenever
        # the list was empty (loop variable never bound); removed.


def parse_name(text: str) -> re.Match | None:
    """Split a theme-song title like ``1: "Song Name (日本語)"``.

    Group 1 is the primary title, group 2 the optional parenthesized
    alternative title.  Returns ``None`` when *text* does not match the
    expected format.
    """
    return re.fullmatch(r'^(?:\d+:\s*)?"(.*?)(?:\((.*)\))?"$', text)


def parse_songs(
    tr_elements,
    song_position: str,
    series_name_eng: str,
    series_name: str,
) -> Iterator[MyAnimeListSong]:
    """Yield :class:`MyAnimeListSong` items from theme-song table rows.

    Rows without a ``.theme-song-artist`` element (header/placeholder
    rows) are skipped, as are rows whose title text does not match the
    format expected by :func:`parse_name`.
    """
    logger.debug('%s: %d theme-song rows', series_name_eng, len(tr_elements))
    for song_tr in tr_elements:
        artist = song_tr.select_one('.theme-song-artist')
        if artist is None:
            continue
        # Detach artist/episode markers so the remaining row text is
        # just the song title.
        artist.extract()
        if episode := song_tr.select_one('.theme-song-episode'):
            episode.extract()

        song_artist = artist.get_text().strip().removeprefix('by ')
        song_title = song_tr.get_text().strip()

        m = parse_name(song_title)
        if m is None:
            # Unexpected title format: skip the row instead of crashing
            # with AttributeError on m.group(...).
            logger.warning('Could not parse song title: %r', song_title)
            continue
        song_name_eng = m.group(1).strip()
        song_name_jp = m.group(2).strip() if m.group(2) else None

        logger.debug('  %s', song_name_eng)
        yield MyAnimeListSong(
            song_name_eng=song_name_eng,
            song_name_jp=song_name_jp,
            song_artist=song_artist,
            song_placement=song_position,
            series_name_eng=series_name_eng,
            series_name=series_name,
        )


@dataclasses.dataclass(frozen=True)
class MyAnimeListSongs(Scraper):
    """Scrapes opening/ending theme songs for every anime on the list."""

    dataset_name = 'myanimelist_songs'
    deduplicate_mode = DeduplicateMode.BY_FIRST_COLUMN

    def get_songs_for_anime(
        self,
        anime: MyAnimeListAnime,
    ) -> Iterator[MyAnimeListSong]:
        """Fetch *anime*'s detail page and yield its theme songs.

        Raises:
            requests.HTTPError: if the detail page cannot be fetched.
        """
        response = self.session.get(anime.series_myanimelist_url.geturl())
        response.raise_for_status()

        soup = bs4.BeautifulSoup(response.text)
        # Strip <script> tags and popup blocks so they do not pollute the
        # get_text() output used by parse_songs().
        for script in soup.select('script'):
            script.extract()
        for popup in soup.select('.oped-popup'):
            popup.extract()

        # NOTE(review): '.opnening' (sic) presumably mirrors a misspelled
        # CSS class in MyAnimeList's own markup — confirm against the live
        # page before "fixing" the selector.
        yield from parse_songs(
            soup.select('.theme-songs.opnening table tr'),
            'opening',
            anime.series_name_eng,
            anime.series_name,
        )
        yield from parse_songs(
            soup.select('.theme-songs.ending table tr'),
            'ending',
            anime.series_name_eng,
            anime.series_name,
        )

    def scrape(self) -> Iterator[MyAnimeListSong]:
        """Yield theme songs for every anime on the user's list."""
        for anime in MyAnimeList(self.session).scrape():
            yield from self.get_songs_for_anime(anime)