Download MAL songs
This commit is contained in:
parent
6749479f38
commit
5ef8787a30
|
@ -1,5 +1,6 @@
|
|||
import abc
|
||||
import bs4
|
||||
import re
|
||||
import urllib.parse
|
||||
import json
|
||||
import dataclasses
|
||||
|
@ -20,6 +21,16 @@ class MyAnimeListAnime:
|
|||
series_icon: urllib.parse.ParseResult
|
||||
me_score: int
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
|
||||
class MyAnimeListSong:
|
||||
song_name_eng: str
|
||||
song_name_jp: str | None
|
||||
song_artist: str
|
||||
song_placement: str
|
||||
series_name_eng: str
|
||||
series_name: str
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
|
||||
class MyAnimeList(Scraper):
|
||||
dataset_name = 'myanimelist_anime'
|
||||
|
@ -45,3 +56,68 @@ class MyAnimeList(Scraper):
|
|||
)
|
||||
|
||||
del data_item
|
||||
|
||||
def parse_name(text: str):
|
||||
match = re.fullmatch(r'^(?:\d+:\s*)?"(.*?)(?:\((.*)\))?"$', text)
|
||||
return match
|
||||
|
||||
assert parse_name('"Soundscape"')
|
||||
assert parse_name('"Soundscape (サウンドスケープ)"').group(2) is not None
|
||||
assert parse_name('1: "Soundscape"')
|
||||
assert parse_name('2: "Soundscape (サウンドスケープ)"').group(2) is not None
|
||||
|
||||
def parse_songs(tr_elements, song_position: str, series_name_eng: str, series_name: str):
    """Yield a ``MyAnimeListSong`` for each theme-song row in *tr_elements*.

    Args:
        tr_elements: Sized iterable of parsed table-row elements. Each row is
            modified in place: its artist and episode elements are detached.
        song_position: Stored as ``song_placement`` on every yielded song.
        series_name_eng: Stored on every yielded song; also printed once.
        series_name: Stored on every yielded song.

    Yields:
        One ``MyAnimeListSong`` per row that has a ``.theme-song-artist``
        element. Rows without one are skipped.
    """
    print(series_name_eng, len(tr_elements))
    for song_tr in tr_elements:
        artist = song_tr.select_one('.theme-song-artist')
        if artist is None:
            continue

        # Detach the artist and episode elements so the text left in the row
        # is only the (optionally numbered) quoted title.
        artist.extract()
        if episode := song_tr.select_one('.theme-song-episode'):
            episode.extract()

        song_artist = artist.get_text().strip().removeprefix('by ')

        raw_title = song_tr.get_text().strip()
        m = parse_name(raw_title)
        if m is None:
            # The row text is not in the expected quoted format. Keep the raw
            # text as the title rather than failing on `None.group(...)` and
            # aborting the whole scrape because of a single row.
            song_name_eng = raw_title
            song_name_jp = None
        else:
            song_name_eng = m.group(1).strip()
            song_name_jp = m.group(2).strip() if m.group(2) else None

        song = MyAnimeListSong(
            song_name_eng=song_name_eng,
            song_name_jp=song_name_jp,
            song_artist=song_artist,
            song_placement=song_position,
            series_name_eng=series_name_eng,
            series_name=series_name,
        )
        print(' ', song_name_eng)
        yield song
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
class MyAnimeListSongs(Scraper):
    """Scraper yielding the theme songs of every anime produced by ``MyAnimeList``."""

    dataset_name = 'myanimelist_songs'
    # NOTE(review): presumably this deduplicates on the first field
    # (song_name_eng) - verify that identically titled songs from different
    # series are not collapsed into one row.
    deduplicate_mode = DeduplicateMode.BY_FIRST_COLUMN

    def get_songs_for_anime(self, anime: MyAnimeListAnime):
        """Fetch the page of *anime* and yield its opening and ending songs.

        Openings are yielded before endings. An HTTP error status raises via
        ``response.raise_for_status()``.
        """
        response = self.session.get(anime.series_myanimelist_url.geturl())
        response.raise_for_status()

        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed (and warns), so the parsed tree could differ
        # between environments. 'html.parser' ships with Python.
        soup = bs4.BeautifulSoup(response.text, 'html.parser')

        # Remove script and popup elements so their text does not leak into
        # the row text read by parse_songs.
        for script in soup.select('script'):
            script.extract()
        for popup in soup.select('.oped-popup'):
            popup.extract()

        # NOTE(review): 'opnening' looks like a typo of 'opening'; presumably
        # it mirrors the class name used in the page markup - confirm against
        # a live page before changing it.
        yield from parse_songs(
            soup.select('.theme-songs.opnening table tr'),
            'opening', anime.series_name_eng, anime.series_name,
        )
        yield from parse_songs(
            soup.select('.theme-songs.ending table tr'),
            'ending', anime.series_name_eng, anime.series_name,
        )

    def scrape(self) -> Iterator[MyAnimeListSong]:
        """Yield the songs of every anime returned by the ``MyAnimeList`` scraper."""
        for anime in MyAnimeList(self.session).scrape():
            yield from self.get_songs_for_anime(anime)
|
||||
|
|
Loading…
Reference in New Issue
Block a user