# Reconstructed from a whitespace-collapsed diff against
# personal_data/fetchers/myanimelist.py. Only the imports and the definitions
# that the diff shows in full are reproduced here. `Scraper`,
# `DeduplicateMode`, `Iterator`, `MyAnimeList` and `MyAnimeListAnime` come from
# parts of that file which are not visible in this excerpt.
import abc
import bs4
import logging
import re
import urllib.parse
import json
import dataclasses

logger = logging.getLogger(__name__)


@dataclasses.dataclass(frozen=True)
class MyAnimeListSong:
    """One theme song row scraped from an anime page.

    Attributes:
        song_name_eng: Song title with any trailing parenthesised part removed.
        song_name_jp: Content of the trailing parentheses in the title, or
            ``None`` when the title has none.
        song_artist: Artist text with a leading ``'by '`` removed.
        song_placement: Position label supplied by the caller
            (``'opening'`` or ``'ending'`` in this module).
        series_name_eng: English name of the series the song belongs to.
        series_name: Name of the series the song belongs to.
    """

    song_name_eng: str
    song_name_jp: str | None
    song_artist: str
    song_placement: str
    series_name_eng: str
    series_name: str


# Shape of the text left in a theme-song row once the artist and episode
# elements are removed: an optional "<number>: " prefix followed by a
# double-quoted title that may end with a parenthesised alternative title.
# No anchors are needed because the pattern is only used with fullmatch().
_SONG_NAME_PATTERN = re.compile(r'(?:\d+:\s*)?"(.*?)(?:\((.*)\))?"')


def parse_name(text: str) -> re.Match[str] | None:
    """Match a theme-song title string.

    Args:
        text: Row text such as ``'1: "Soundscape (サウンドスケープ)"'``.

    Returns:
        A match whose group 1 is the title before the optional trailing
        parentheses and whose group 2 is the content of those parentheses
        (``None`` when absent), or ``None`` when *text* does not have the
        expected shape.
    """
    return _SONG_NAME_PATTERN.fullmatch(text)


# Import-time sanity checks for the title pattern.
assert parse_name('"Soundscape"')
assert parse_name('"Soundscape (サウンドスケープ)"').group(2) is not None
assert parse_name('1: "Soundscape"')
assert parse_name('2: "Soundscape (サウンドスケープ)"').group(2) is not None


def parse_songs(
    tr_elements: list[bs4.Tag],
    song_position: str,
    series_name_eng: str,
    series_name: str,
) -> Iterator[MyAnimeListSong]:
    """Yield one ``MyAnimeListSong`` per table row that names an artist.

    The artist and episode elements are detached from each row as a side
    effect, so the rows passed in are modified.

    Args:
        tr_elements: Table rows selected from a theme-songs table.
        song_position: Stored verbatim as ``song_placement``.
        series_name_eng: Stored verbatim on every yielded song.
        series_name: Stored verbatim on every yielded song.

    Yields:
        A song for every row containing a ``.theme-song-artist`` element.
        Rows without one are skipped.
    """
    logger.debug('%s: %d theme song rows', series_name_eng, len(tr_elements))
    for song_tr in tr_elements:
        artist = song_tr.select_one('.theme-song-artist')
        if artist is None:
            continue

        # Detach the artist and episode elements so that the text remaining
        # in the row is only the (optionally numbered) quoted title.
        artist.extract()
        episode = song_tr.select_one('.theme-song-episode')
        if episode is not None:
            episode.extract()

        song_artist = artist.get_text().strip().removeprefix('by ')

        raw_title = song_tr.get_text().strip()
        match = parse_name(raw_title)
        if match is None:
            # A title that does not fit the pattern used to raise
            # AttributeError here and abort the whole scrape. Keep the raw
            # text instead so the row is not lost.
            logger.warning(
                'Unrecognised song title format for %s: %r',
                series_name_eng,
                raw_title,
            )
            song_name_eng = raw_title
            song_name_jp = None
        else:
            song_name_eng = match.group(1).strip()
            song_name_jp = match.group(2).strip() if match.group(2) else None

        logger.debug('  %s', song_name_eng)
        yield MyAnimeListSong(
            song_name_eng=song_name_eng,
            song_name_jp=song_name_jp,
            song_artist=song_artist,
            song_placement=song_position,
            series_name_eng=series_name_eng,
            series_name=series_name,
        )


@dataclasses.dataclass(frozen=True)
class MyAnimeListSongs(Scraper):
    """Scraper that yields the theme songs of every anime from ``MyAnimeList``."""

    dataset_name = 'myanimelist_songs'
    # NOTE(review): the first field of MyAnimeListSong is song_name_eng. If
    # BY_FIRST_COLUMN keys on that field alone, identically titled songs from
    # different series would be treated as duplicates - confirm.
    deduplicate_mode = DeduplicateMode.BY_FIRST_COLUMN

    def get_songs_for_anime(
        self, anime: MyAnimeListAnime,
    ) -> Iterator[MyAnimeListSong]:
        """Fetch the page of *anime* and yield its opening and ending songs.

        Raises:
            Whatever ``response.raise_for_status()`` raises when the page
            request returns an error status.
        """
        response = self.session.get(anime.series_myanimelist_url.geturl())
        response.raise_for_status()

        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed and emits GuessedAtParserWarning.
        soup = bs4.BeautifulSoup(response.text, 'html.parser')

        # Drop elements that are not wanted in the song rows before selecting.
        for script in soup.select('script'):
            script.extract()
        for popup in soup.select('.oped-popup'):
            popup.extract()

        # NOTE(review): 'opnening' looks like a typo but is kept byte-for-byte;
        # presumably it mirrors the class name used in the site's own markup.
        # Verify against a live page before "fixing" it.
        yield from parse_songs(
            soup.select('.theme-songs.opnening table tr'),
            'opening',
            anime.series_name_eng,
            anime.series_name,
        )
        yield from parse_songs(
            soup.select('.theme-songs.ending table tr'),
            'ending',
            anime.series_name_eng,
            anime.series_name,
        )

    def scrape(self) -> Iterator[MyAnimeListSong]:
        """Yield the songs of every anime produced by the ``MyAnimeList`` scraper."""
        for anime in MyAnimeList(self.session).scrape():
            yield from self.get_songs_for_anime(anime)