124 lines
4.1 KiB
Python
124 lines
4.1 KiB
Python
import abc
|
|
import bs4
|
|
import re
|
|
import urllib.parse
|
|
import json
|
|
import dataclasses
|
|
import logging
|
|
import secrets
|
|
from collections.abc import Iterator, Mapping
|
|
from enum import Enum
|
|
|
|
from personal_data.data import DeduplicateMode, Scraper
|
|
|
|
# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
|
|
|
|
@dataclasses.dataclass(frozen=True)
class MyAnimeListAnime:
    """A single entry scraped from a MyAnimeList user's anime list."""

    # English title; the scraper falls back to the original title when no
    # English one exists.
    series_name_eng: str
    # Original (non-English) title; falls back to the English title.
    series_name: str
    # Absolute URL of the anime's MyAnimeList page.
    series_myanimelist_url: urllib.parse.ParseResult
    # Absolute URL of the anime's cover/icon image.
    series_icon: urllib.parse.ParseResult
    # Score the list owner gave this anime.
    # NOTE(review): the scraper passes data_item.get('score'), which can be
    # None — confirm whether the `int` annotation should be `int | None`.
    me_score: int
|
|
|
|
@dataclasses.dataclass(frozen=True)
class MyAnimeListSong:
    """A theme song (opening or ending) attached to an anime series."""

    # English song title, as parsed from the theme-song label.
    song_name_eng: str
    # Original-language title from the parenthesised part of the label,
    # or None when the label has no parenthesised part.
    song_name_jp: str | None
    # Artist text with the leading 'by ' prefix removed.
    song_artist: str
    # Placement of the song: the caller passes 'opening' or 'ending'.
    song_placement: str
    # Titles of the series this song belongs to (see MyAnimeListAnime).
    series_name_eng: str
    series_name: str
|
|
|
|
|
|
@dataclasses.dataclass(frozen=True)
class MyAnimeList(Scraper):
    """Scrapes a user's anime list from myanimelist.net.

    Yields one ``MyAnimeListAnime`` per entry on the list page.
    """

    dataset_name = 'myanimelist_anime'
    deduplicate_mode = DeduplicateMode.BY_FIRST_COLUMN

    def scrape(self) -> Iterator[MyAnimeListAnime]:
        """Fetch the anime-list page and yield each list entry.

        Raises:
            requests.HTTPError: if the list page cannot be fetched.
        """
        # TODO(review): username is hard-coded; consider making it
        # configurable on the scraper.
        username = 'WhereTheDogGoin'
        url = f'https://myanimelist.net/animelist/{username}'
        response = self.session.get(url)
        response.raise_for_status()

        # Name the parser explicitly: without it bs4 guesses and emits a
        # GuessedAtParserWarning, and the guess can differ per environment.
        soup = bs4.BeautifulSoup(response.text, 'html.parser')

        # The whole list is embedded as JSON in a `data-items` attribute.
        data_items_soup = soup.select_one('[data-items]')
        if data_items_soup is None:
            # Page layout changed (or the list is private): yield nothing
            # instead of crashing with a bare IndexError.
            logger.warning('No [data-items] element found for user %s', username)
            return
        data_items = json.loads(data_items_soup.get('data-items'))

        for data_item in data_items:
            yield MyAnimeListAnime(
                # Prefer the English title, falling back to the original
                # (and vice versa for series_name).
                series_name_eng=data_item.get('anime_title_eng')
                or data_item.get('anime_title'),
                series_name=data_item.get('anime_title')
                or data_item.get('anime_title_eng'),
                # URLs in the payload are site-relative; resolve them
                # against the page URL before parsing.
                series_myanimelist_url=urllib.parse.urlparse(
                    urllib.parse.urljoin(url, data_item['anime_url']),
                ),
                series_icon=urllib.parse.urlparse(
                    urllib.parse.urljoin(url, data_item['anime_image_path']),
                ),
                me_score=data_item.get('score'),
            )
|
|
|
|
def parse_name(text: str):
|
|
match = re.fullmatch(r'^(?:\d+:\s*)?"(.*?)(?:\((.*)\))?"$', text)
|
|
return match
|
|
|
|
assert parse_name('"Soundscape"')
|
|
assert parse_name('"Soundscape (サウンドスケープ)"').group(2) is not None
|
|
assert parse_name('1: "Soundscape"')
|
|
assert parse_name('2: "Soundscape (サウンドスケープ)"').group(2) is not None
|
|
|
|
def parse_songs(
    tr_elements,
    song_position: str,
    series_name_eng: str,
    series_name: str,
) -> Iterator[MyAnimeListSong]:
    """Parse theme-song table rows into ``MyAnimeListSong`` items.

    Args:
        tr_elements: bs4 ``<tr>`` elements from a theme-song table.
        song_position: placement label, e.g. 'opening' or 'ending'.
        series_name_eng: English series title attached to each song.
        series_name: original series title attached to each song.
    """
    # Use the module logger instead of bare print() for debug output.
    logger.debug('%s: %d song rows', series_name_eng, len(tr_elements))
    for song_tr in tr_elements:
        artist_elem = song_tr.select_one('.theme-song-artist')
        # Rows without an artist element are not song rows; skip them.
        if artist_elem is None:
            continue
        # Remove the artist and episode markers so that only the song
        # title remains in the row's text.
        artist_elem.extract()
        if episode_elem := song_tr.select_one('.theme-song-episode'):
            episode_elem.extract()

        song_artist = artist_elem.get_text().strip().removeprefix('by ')

        raw_name = song_tr.get_text().strip()
        m = parse_name(raw_name)
        if m is None:
            # Unexpected label format: skip the row instead of crashing
            # on m.group(1) with an AttributeError.
            logger.warning('Could not parse song name: %r', raw_name)
            continue

        song_name_eng = m.group(1).strip()
        song_name_jp = m.group(2).strip() if m.group(2) else None

        logger.debug('  %s', song_name_eng)
        yield MyAnimeListSong(
            song_name_eng=song_name_eng,
            song_name_jp=song_name_jp,
            song_artist=song_artist,
            song_placement=song_position,
            series_name_eng=series_name_eng,
            series_name=series_name,
        )
|
|
|
|
@dataclasses.dataclass(frozen=True)
class MyAnimeListSongs(Scraper):
    """Scrapes opening/ending theme songs for every anime on the list."""

    dataset_name = 'myanimelist_songs'
    deduplicate_mode = DeduplicateMode.BY_FIRST_COLUMN

    def get_songs_for_anime(
        self, anime: MyAnimeListAnime,
    ) -> Iterator[MyAnimeListSong]:
        """Fetch an anime's page and yield its opening and ending songs.

        Raises:
            requests.HTTPError: if the anime page cannot be fetched.
        """
        response = self.session.get(anime.series_myanimelist_url.geturl())
        response.raise_for_status()

        # Name the parser explicitly to avoid bs4's parser-guessing
        # warning and keep behaviour consistent across environments.
        soup = bs4.BeautifulSoup(response.text, 'html.parser')

        # Strip scripts and popup markup so they do not pollute the
        # row text that parse_songs() extracts.
        for script in soup.select('script'):
            script.extract()
        for popup in soup.select('.oped-popup'):
            popup.extract()

        # NOTE(review): '.opnening' looks like a typo but is kept as-is —
        # it appears to match the misspelled CSS class in myanimelist.net's
        # own markup; confirm against live HTML before "fixing" it.
        yield from parse_songs(
            soup.select('.theme-songs.opnening table tr'),
            'opening',
            anime.series_name_eng,
            anime.series_name,
        )
        yield from parse_songs(
            soup.select('.theme-songs.ending table tr'),
            'ending',
            anime.series_name_eng,
            anime.series_name,
        )

    def scrape(self) -> Iterator[MyAnimeListSong]:
        """Yield every theme song for every anime on the user's list."""
        for anime in MyAnimeList(self.session).scrape():
            yield from self.get_songs_for_anime(anime)
|