personal-data/personal_data/fetchers/myanimelist.py
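"""Scrapers for MyAnimeList: the configured user's anime list and the
opening/ending theme songs of each anime on that list."""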

import dataclasses
import json
import logging
import re
import urllib.parse
from collections.abc import Iterator

import bs4

from personal_data.data import DeduplicateMode, Scraper

logger = logging.getLogger(__name__)


@dataclasses.dataclass(frozen=True)
class MyAnimeListAnime:
    """A single entry on the user's MyAnimeList anime list."""

    series_name_eng: str
    series_name: str
    series_myanimelist_url: urllib.parse.ParseResult
    series_icon: urllib.parse.ParseResult
    me_score: int


@dataclasses.dataclass(frozen=True)
class MyAnimeListSong:
    """An opening or ending theme song associated with an anime."""

    song_name_eng: str
    song_name_jp: str | None
    song_artist: str
    song_placement: str
    series_name_eng: str
    series_name: str


@dataclasses.dataclass(frozen=True)
class MyAnimeList(Scraper):
    """Scrapes the anime list of a fixed MyAnimeList user."""

    dataset_name = 'myanimelist_anime'
    deduplicate_mode = DeduplicateMode.BY_FIRST_COLUMN

    def scrape(self) -> Iterator[MyAnimeListAnime]:
        username = 'WhereTheDogGoin'
        url = f'https://myanimelist.net/animelist/{username}'
        response = self.session.get(url)
        response.raise_for_status()

        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        # The list page embeds the full anime list as JSON in a `data-items` attribute.
        data_items_soup = soup.select_one('[data-items]')
        data_items = json.loads(data_items_soup.get('data-items'))

        for data_item in data_items:
            yield MyAnimeListAnime(
                series_name_eng=data_item.get('anime_title_eng') or data_item.get('anime_title'),
                series_name=data_item.get('anime_title') or data_item.get('anime_title_eng'),
                series_myanimelist_url=urllib.parse.urlparse(urllib.parse.urljoin(url, data_item['anime_url'])),
                series_icon=urllib.parse.urlparse(urllib.parse.urljoin(url, data_item['anime_image_path'])),
                me_score=data_item.get('score'),
            )
        del data_item


def parse_name(text: str) -> re.Match | None:
    """Parses a theme song title such as `1: "English Title (日本語タイトル)"`.

    Group 1 is the English title, group 2 the optional Japanese title.
    """
    return re.fullmatch(r'(?:\d+:\s*)?"(.*?)(?:\((.*)\))?"', text)


assert parse_name('"Soundscape"')
assert parse_name('"Soundscape (サウンドスケープ)"').group(2) is not None
assert parse_name('1: "Soundscape"')
assert parse_name('2: "Soundscape (サウンドスケープ)"').group(2) is not None


def parse_songs(
    tr_elements,
    song_position: str,
    series_name_eng: str,
    series_name: str,
) -> Iterator[MyAnimeListSong]:
    logger.debug('%s: %d theme song rows', series_name_eng, len(tr_elements))
    for song_tr in tr_elements:
        artist = song_tr.select_one('.theme-song-artist')
        if artist is None:
            continue
        # Remove the artist and episode markers so only the title text remains.
        artist.extract()
        if e := song_tr.select_one('.theme-song-episode'):
            e.extract()
            del e

        song_artist = artist.get_text().strip().removeprefix('by ')
        song_title_text = song_tr.get_text().strip()
        m = parse_name(song_title_text)
        if m is None:
            logger.warning('Could not parse song title: %s', song_title_text)
            continue
        song_name_eng = m.group(1).strip()
        song_name_jp = m.group(2).strip() if m.group(2) else None

        song = MyAnimeListSong(
            song_name_eng=song_name_eng,
            song_name_jp=song_name_jp,
            song_artist=song_artist,
            song_placement=song_position,
            series_name_eng=series_name_eng,
            series_name=series_name,
        )
        logger.debug('  %s', song_name_eng)
        yield song


@dataclasses.dataclass(frozen=True)
class MyAnimeListSongs(Scraper):
    """Scrapes opening and ending theme songs for every anime on the list."""

    dataset_name = 'myanimelist_songs'
    deduplicate_mode = DeduplicateMode.BY_FIRST_COLUMN

    def get_songs_for_anime(self, anime: MyAnimeListAnime) -> Iterator[MyAnimeListSong]:
        response = self.session.get(anime.series_myanimelist_url.geturl())
        response.raise_for_status()

        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        # Strip scripts and popups so their text does not leak into the song rows.
        for script in soup.select('script'):
            script.extract()
        for popup in soup.select('.oped-popup'):
            popup.extract()

        # Note: `.opnening` (sic) matches the class name used in MyAnimeList's markup.
        yield from parse_songs(soup.select('.theme-songs.opnening table tr'),
                               'opening', anime.series_name_eng, anime.series_name)
        yield from parse_songs(soup.select('.theme-songs.ending table tr'),
                               'ending', anime.series_name_eng, anime.series_name)

    def scrape(self) -> Iterator[MyAnimeListSong]:
        for anime in MyAnimeList(self.session).scrape():
            yield from self.get_songs_for_anime(anime)
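

# Minimal manual-run sketch, not part of the normal personal_data pipeline. It
# assumes a Scraper can be constructed directly from a `requests.Session`,
# matching how `MyAnimeList(self.session)` is instantiated above; adjust if the
# Scraper base class expects additional arguments.
if __name__ == '__main__':
    import requests

    logging.basicConfig(level=logging.DEBUG)
    for song in MyAnimeListSongs(requests.Session()).scrape():
        print(song)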