1
0
personal-data/personal_data/fetchers/myanimelist.py

124 lines
4.1 KiB
Python
Raw Normal View History

2025-02-01 19:00:21 +00:00
import abc
import bs4
2025-02-01 20:51:58 +00:00
import re
2025-02-01 19:00:21 +00:00
import urllib.parse
import json
import dataclasses
import logging
import secrets
from collections.abc import Iterator, Mapping
from enum import Enum
from personal_data.data import DeduplicateMode, Scraper
logger = logging.getLogger(__name__)
2025-02-01 19:33:54 +00:00
@dataclasses.dataclass(frozen=True)
class MyAnimeListAnime:
    """A single entry of a MyAnimeList user's anime list."""

    series_name_eng: str  # English title (scraper falls back to the original title when absent)
    series_name: str  # original title (scraper falls back to the English title when absent)
    series_myanimelist_url: urllib.parse.ParseResult  # absolute URL of the anime's detail page
    series_icon: urllib.parse.ParseResult  # absolute URL of the anime's cover image
    me_score: int  # the user's own score for this series
2025-02-01 20:51:58 +00:00
@dataclasses.dataclass(frozen=True)
class MyAnimeListSong:
    """An opening or ending theme song of an anime series."""

    song_name_eng: str  # title as listed (usually English/romaji)
    song_name_jp: str | None  # parenthesised Japanese title, when present
    song_artist: str  # performing artist, with the leading 'by ' removed
    song_placement: str  # 'opening' or 'ending'
    series_name_eng: str  # English title of the series the song belongs to
    series_name: str  # original title of the series the song belongs to
2025-02-01 19:00:21 +00:00
@dataclasses.dataclass(frozen=True)
class MyAnimeList(Scraper):
    """Scrapes a user's anime list from MyAnimeList.

    The anime-list page embeds its data as JSON in a ``data-items``
    attribute, so the payload is decoded directly instead of parsing
    the rendered HTML table.
    """

    dataset_name = 'myanimelist_anime'
    deduplicate_mode = DeduplicateMode.BY_FIRST_COLUMN

    def scrape(self) -> Iterator[MyAnimeListAnime]:
        """Yield one MyAnimeListAnime per entry on the user's list.

        Raises:
            RuntimeError: if the page no longer contains a
                ``data-items`` element (markup change on the site).
        """
        # TODO: make the username configurable instead of hard-coding it.
        username = 'WhereTheDogGoin'
        url = f'https://myanimelist.net/animelist/{username}'
        response = self.session.get(url)
        response.raise_for_status()

        soup = bs4.BeautifulSoup(response.text)
        data_items_soup = soup.select_one('[data-items]')
        if data_items_soup is None:
            # Fail with a descriptive error instead of a bare IndexError.
            msg = f'No [data-items] element found on {url}'
            raise RuntimeError(msg)
        data_items = json.loads(data_items_soup.get('data-items'))

        for data_item in data_items:
            yield MyAnimeListAnime(
                # Prefer the English title, falling back to the original
                # (and vice versa for series_name).
                series_name_eng=data_item.get('anime_title_eng')
                or data_item.get('anime_title'),
                series_name=data_item.get('anime_title')
                or data_item.get('anime_title_eng'),
                series_myanimelist_url=urllib.parse.urlparse(
                    urllib.parse.urljoin(url, data_item['anime_url']),
                ),
                series_icon=urllib.parse.urlparse(
                    urllib.parse.urljoin(url, data_item['anime_image_path']),
                ),
                me_score=data_item.get('score'),
            )
2025-02-01 20:51:58 +00:00
def parse_name(text: str):
match = re.fullmatch(r'^(?:\d+:\s*)?"(.*?)(?:\((.*)\))?"$', text)
return match
# Import-time sanity checks for parse_name.
# NOTE(review): these run on every import and are stripped under
# ``python -O``; consider moving them into a proper test suite.
assert parse_name('"Soundscape"')
assert parse_name('"Soundscape (サウンドスケープ)"').group(2) is not None
assert parse_name('1: "Soundscape"')
assert parse_name('2: "Soundscape (サウンドスケープ)"').group(2) is not None
def parse_songs(
    tr_elements,
    song_position: str,
    series_name_eng: str,
    series_name: str,
) -> Iterator[MyAnimeListSong]:
    """Yield songs from the ``<tr>`` rows of a MyAnimeList theme-song table.

    Args:
        tr_elements: bs4 row elements of an opening/ending section.
        song_position: Placement label, e.g. ``'opening'`` or ``'ending'``.
        series_name_eng: English title of the series the songs belong to.
        series_name: Original title of the series the songs belong to.
    """
    logger.debug('Parsing %d rows for %s', len(tr_elements), series_name_eng)
    for song_tr in tr_elements:
        artist = song_tr.select_one('.theme-song-artist')
        if artist is None:
            # Rows without an artist (headers, spacer rows) carry no song.
            continue
        # Remove artist/episode spans so the remaining text is the title.
        artist.extract()
        if episode := song_tr.select_one('.theme-song-episode'):
            episode.extract()
            del episode

        song_artist = artist.get_text().strip().removeprefix('by ')
        m = parse_name(song_tr.get_text().strip())
        if m is None:
            # Unexpected title format: log and skip instead of crashing
            # on m.group(...) with an AttributeError.
            logger.warning(
                'Could not parse song title in %s (%s)',
                series_name_eng,
                song_position,
            )
            continue
        song_name_eng = m.group(1).strip()
        song_name_jp = m.group(2).strip() if m.group(2) else None

        logger.debug('  %s', song_name_eng)
        yield MyAnimeListSong(
            song_name_eng=song_name_eng,
            song_name_jp=song_name_jp,
            song_artist=song_artist,
            song_placement=song_position,
            series_name_eng=series_name_eng,
            series_name=series_name,
        )
@dataclasses.dataclass(frozen=True)
class MyAnimeListSongs(Scraper):
    """Scrapes opening/ending theme songs for each anime on the user's list."""

    dataset_name = 'myanimelist_songs'
    deduplicate_mode = DeduplicateMode.BY_FIRST_COLUMN

    def get_songs_for_anime(self, anime: MyAnimeListAnime):
        """Fetch the anime's detail page and yield its theme songs."""
        response = self.session.get(anime.series_myanimelist_url.geturl())
        response.raise_for_status()
        soup = bs4.BeautifulSoup(response.text)

        # Drop elements whose text would pollute the extracted song titles.
        for script_el in soup.select('script'):
            script_el.extract()
        for popup_el in soup.select('.oped-popup'):
            popup_el.extract()

        # NOTE(review): 'opnening' is presumably the site's own misspelled
        # CSS class name - confirm against the live markup before "fixing".
        for placement, selector in (
            ('opening', '.theme-songs.opnening table tr'),
            ('ending', '.theme-songs.ending table tr'),
        ):
            yield from parse_songs(
                soup.select(selector),
                placement,
                anime.series_name_eng,
                anime.series_name,
            )

    def scrape(self) -> Iterator[MyAnimeListSong]:
        """Yield every theme song across all anime on the user's list."""
        for anime in MyAnimeList(self.session).scrape():
            yield from self.get_songs_for_anime(anime)