"""Scrapers for a MyAnimeList user's anime list and the theme songs of each series."""

import abc
import dataclasses
import json
import logging
import re
import secrets
import urllib.parse
from collections.abc import Iterator, Mapping
from enum import Enum

import bs4

from personal_data.data import DeduplicateMode, Scraper

logger = logging.getLogger(__name__)


@dataclasses.dataclass(frozen=True)
class MyAnimeListAnime:
    """One entry from a user's MyAnimeList anime list."""

    series_name_eng: str  # English title, falling back to the romaji title
    series_name: str  # Romaji title, falling back to the English title
    series_myanimelist_url: urllib.parse.ParseResult
    series_icon: urllib.parse.ParseResult
    me_score: int  # The list owner's score for the series


@dataclasses.dataclass(frozen=True)
class MyAnimeListSong:
    """An opening or ending theme song attached to a series."""

    song_name_eng: str
    song_name_jp: str | None  # Japanese title when given in parentheses, else None
    song_artist: str
    song_placement: str  # 'opening' or 'ending'
    series_name_eng: str
    series_name: str


@dataclasses.dataclass(frozen=True)
class MyAnimeList(Scraper):
    """Scrapes the anime list of a fixed MyAnimeList user."""

    dataset_name = 'myanimelist_anime'
    deduplicate_mode = DeduplicateMode.BY_FIRST_COLUMN

    def scrape(self) -> Iterator[MyAnimeListAnime]:
        """Yield every anime on the user's list.

        The list page embeds its data as JSON in a ``data-items``
        attribute, so no per-entry page fetches are needed.
        """
        # TODO(review): username is hard-coded — consider making it configurable.
        username = 'WhereTheDogGoin'
        url = f'https://myanimelist.net/animelist/{username}'
        response = self.session.get(url)
        response.raise_for_status()

        # Explicit parser avoids bs4's "no parser was explicitly specified"
        # warning and keeps parsing consistent across environments.
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        data_items_soup = soup.select_one('[data-items]')
        if data_items_soup is None:
            # Page layout changed (or the list is private): log and yield nothing
            # instead of crashing with a bare IndexError.
            logger.warning('No [data-items] element found on %s', url)
            return
        data_items = json.loads(data_items_soup.get('data-items'))

        for data_item in data_items:
            yield MyAnimeListAnime(
                series_name_eng=data_item.get('anime_title_eng')
                or data_item.get('anime_title'),
                series_name=data_item.get('anime_title')
                or data_item.get('anime_title_eng'),
                series_myanimelist_url=urllib.parse.urlparse(
                    urllib.parse.urljoin(url, data_item['anime_url']),
                ),
                series_icon=urllib.parse.urlparse(
                    urllib.parse.urljoin(url, data_item['anime_image_path']),
                ),
                me_score=data_item.get('score'),
            )


def parse_name(text: str) -> re.Match | None:
    """Parse a theme-song title such as ``1: "Name (日本語名)"``.

    Group 1 is the primary (English) title; group 2, when present, is the
    parenthesized Japanese title. Returns None when the text does not match.
    """
    # fullmatch already anchors at both ends, so no ^/$ are needed.
    return re.fullmatch(r'(?:\d+:\s*)?"(.*?)(?:\((.*)\))?"', text)


# Module-level sanity checks for the title grammar.
assert parse_name('"Soundscape"')
assert parse_name('"Soundscape (サウンドスケープ)"').group(2) is not None
assert parse_name('1: "Soundscape"')
assert parse_name('2: "Soundscape (サウンドスケープ)"').group(2) is not None


def parse_songs(
    tr_elements,
    song_position: str,
    series_name_eng: str,
    series_name: str,
) -> Iterator[MyAnimeListSong]:
    """Yield MyAnimeListSong items parsed from theme-song table rows.

    Rows without a ``.theme-song-artist`` element (header rows) are skipped,
    as are rows whose title text does not match the expected grammar.
    """
    logger.debug('%s: %d candidate rows', series_name_eng, len(tr_elements))
    for song_tr in tr_elements:
        artist_elem = song_tr.select_one('.theme-song-artist')
        if artist_elem is None:
            continue
        # Remove artist and episode markers so the remaining row text is
        # just the song title.
        artist_elem.extract()
        if episode_elem := song_tr.select_one('.theme-song-episode'):
            episode_elem.extract()

        song_artist = artist_elem.get_text().strip().removeprefix('by ')
        raw_title = song_tr.get_text().strip()
        match = parse_name(raw_title)
        if match is None:
            # Previously this crashed with AttributeError on unexpected
            # formats; skip the row instead.
            logger.warning('Could not parse song title: %r', raw_title)
            continue
        song_name_eng = match.group(1).strip()
        song_name_jp = match.group(2).strip() if match.group(2) else None

        logger.debug('  %s', song_name_eng)
        yield MyAnimeListSong(
            song_name_eng=song_name_eng,
            song_name_jp=song_name_jp,
            song_artist=song_artist,
            song_placement=song_position,
            series_name_eng=series_name_eng,
            series_name=series_name,
        )


@dataclasses.dataclass(frozen=True)
class MyAnimeListSongs(Scraper):
    """Scrapes opening/ending theme songs for every anime on the user's list."""

    dataset_name = 'myanimelist_songs'
    deduplicate_mode = DeduplicateMode.BY_FIRST_COLUMN

    def get_songs_for_anime(self, anime: MyAnimeListAnime) -> Iterator[MyAnimeListSong]:
        """Fetch the series page and yield its opening and ending songs."""
        response = self.session.get(anime.series_myanimelist_url.geturl())
        response.raise_for_status()

        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        # Strip scripts and popups so row text contains only song data.
        for script in soup.select('script'):
            script.extract()
        for script in soup.select('.oped-popup'):
            script.extract()

        # NOTE: '.opnening' is intentional — it matches MyAnimeList's own
        # misspelled CSS class name. Do not "fix" it.
        yield from parse_songs(
            soup.select('.theme-songs.opnening table tr'),
            'opening',
            anime.series_name_eng,
            anime.series_name,
        )
        yield from parse_songs(
            soup.select('.theme-songs.ending table tr'),
            'ending',
            anime.series_name_eng,
            anime.series_name,
        )

    def scrape(self) -> Iterator[MyAnimeListSong]:
        """Yield songs for every anime found by the MyAnimeList scraper."""
        for anime in MyAnimeList(self.session).scrape():
            yield from self.get_songs_for_anime(anime)