1
0

Additional Mastodon pages and DeviantArt parsing.

This commit is contained in:
Jon Michael Aanes 2023-08-06 00:35:03 +02:00
parent 5aad3e1cab
commit bdf60219fd

View File

@ -36,9 +36,12 @@ class SocialSiteId(enum.Enum):
CARRD_PAGE = 24 CARRD_PAGE = 24
HENTAI_FOUNDRY = 25 HENTAI_FOUNDRY = 25
YOUTUBE_CHANNEL_HANDLE = 26 YOUTUBE_CHANNEL_HANDLE = 26
YOUTUBE_CHANNEL_ID = 2397
VIMEO_CHANNEL = 27 VIMEO_CHANNEL = 27
NEWGROUNDS_PAGE = 28 NEWGROUNDS_PAGE = 28
ARTSY_ARTIST = 2042 ARTSY_ARTIST = 2042
LINK_COLLECTION_PAGE = 29
DEVIANT_ART_ACCOUNT = 7737
def wikidata_property(self, client): def wikidata_property(self, client):
return client.get(WIKIDATA_PROPERTIES[self]) return client.get(WIKIDATA_PROPERTIES[self])
@ -80,7 +83,7 @@ WIKIDATA_PROPERTIES = {
SocialSiteId.TIKTOK_USER: WikidataInfo(7085, None), SocialSiteId.TIKTOK_USER: WikidataInfo(7085, None),
SocialSiteId.PIXIV_USER: WikidataInfo(None, 306956), SocialSiteId.PIXIV_USER: WikidataInfo(None, 306956),
#SocialSiteId.MASTODON_PAGE: 2000 + 10, SocialSiteId.MASTODON_PAGE: WikidataInfo(4033, None),
SocialSiteId.PATREON_PAGE: WikidataInfo(4175, 15861362), SocialSiteId.PATREON_PAGE: WikidataInfo(4175, 15861362),
SocialSiteId.ARTSTATION_PAGE: WikidataInfo(None, 65551500), SocialSiteId.ARTSTATION_PAGE: WikidataInfo(None, 65551500),
#SocialSiteId.INPRNT_PAGE: WikidataInfo(None, None), #SocialSiteId.INPRNT_PAGE: WikidataInfo(None, None),
@ -88,9 +91,11 @@ WIKIDATA_PROPERTIES = {
SocialSiteId.CARRD_PAGE: WikidataInfo(None, 106036503), SocialSiteId.CARRD_PAGE: WikidataInfo(None, 106036503),
SocialSiteId.HENTAI_FOUNDRY: WikidataInfo(None, 115903301), SocialSiteId.HENTAI_FOUNDRY: WikidataInfo(None, 115903301),
SocialSiteId.YOUTUBE_CHANNEL_HANDLE: WikidataInfo(11245, 866), SocialSiteId.YOUTUBE_CHANNEL_HANDLE: WikidataInfo(11245, 866),
SocialSiteId.YOUTUBE_CHANNEL_ID: WikidataInfo(2397, None),
SocialSiteId.VIMEO_CHANNEL: WikidataInfo(4015, 156376), SocialSiteId.VIMEO_CHANNEL: WikidataInfo(4015, 156376),
SocialSiteId.NEWGROUNDS_PAGE: WikidataInfo(None, 263655), SocialSiteId.NEWGROUNDS_PAGE: WikidataInfo(None, 263655),
SocialSiteId.ARTSY_ARTIST: WikidataInfo(2042, 4796642), SocialSiteId.ARTSY_ARTIST: WikidataInfo(2042, 4796642),
SocialSiteId.DEVIANT_ART_ACCOUNT: WikidataInfo(7737, None),
} }
def re_social_subdomain(main_domain): def re_social_subdomain(main_domain):
@ -104,6 +109,7 @@ def re_social_path(main_domain):
return re_social_path_adv(main_domain, RE_ID) return re_social_path_adv(main_domain, RE_ID)
def re_social_path_adv(main_domain, *path): def re_social_path_adv(main_domain, *path):
assert not main_domain.startswith('www.'), 'Redundant www.'
l = [r'^', '(?:https?:\/\/)?', r'(?:www\.)?', re.escape(main_domain)] l = [r'^', '(?:https?:\/\/)?', r'(?:www\.)?', re.escape(main_domain)]
for p in path: for p in path:
@ -139,9 +145,13 @@ URL_PARSE_CARRD_PAGE = re_social_subdomain('carrd.co')
URL_PARSE_HENTAI_FOUNDRY= re_social_path_adv('hentai-foundry.com', 'user', RE_ID, 'profile') URL_PARSE_HENTAI_FOUNDRY= re_social_path_adv('hentai-foundry.com', 'user', RE_ID, 'profile')
URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1= re_social_path_adv('youtube.com', RE_ID) URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1= re_social_path_adv('youtube.com', RE_ID)
URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2= re_social_path_adv('youtube.com', 'c', RE_ID) URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2= re_social_path_adv('youtube.com', 'c', RE_ID)
URL_PARSE_YOUTUBE_CHANNEL_ID= re_social_path_adv('youtube.com', 'channel', RE_ID)
URL_PARSE_VIMEO_CHANNEL= re_social_path_adv('vimeo.com', RE_ID) URL_PARSE_VIMEO_CHANNEL= re_social_path_adv('vimeo.com', RE_ID)
URL_PARSE_NEWGROUNDS_PAGE = re_social_subdomain('newgrounds.com') URL_PARSE_NEWGROUNDS_PAGE = re_social_subdomain('newgrounds.com')
URL_PARSE_ARTSY_ARTIST = re_social_path_adv('artsy.net', 'artist', RE_ID) URL_PARSE_ARTSY_ARTIST = re_social_path_adv('artsy.net', 'artist', RE_ID)
URL_PARSE_DEVIANT_ART_ACCOUNT = re_social_path_adv('deviantart.com', RE_ID)
URL_PARSE_DEVIANT_ART_ACCOUNT_2 = re_social_subdomain('deviantart.com')
# TODO: https://<ID>.deviantart.com
REGEXES = [ REGEXES = [
# Reddit # Reddit
@ -214,6 +224,7 @@ REGEXES = [
# Youtube # Youtube
(URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1, SocialSiteId.YOUTUBE_CHANNEL_HANDLE), (URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1, SocialSiteId.YOUTUBE_CHANNEL_HANDLE),
(URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2, SocialSiteId.YOUTUBE_CHANNEL_HANDLE), (URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2, SocialSiteId.YOUTUBE_CHANNEL_HANDLE),
(URL_PARSE_YOUTUBE_CHANNEL_ID, SocialSiteId.YOUTUBE_CHANNEL_ID),
# Vimeo # Vimeo
(URL_PARSE_VIMEO_CHANNEL, SocialSiteId.VIMEO_CHANNEL), (URL_PARSE_VIMEO_CHANNEL, SocialSiteId.VIMEO_CHANNEL),
@ -223,8 +234,32 @@ REGEXES = [
# Artsy # Artsy
(URL_PARSE_ARTSY_ARTIST, SocialSiteId.ARTSY_ARTIST), (URL_PARSE_ARTSY_ARTIST, SocialSiteId.ARTSY_ARTIST),
# Deviant art
(URL_PARSE_DEVIANT_ART_ACCOUNT, SocialSiteId.DEVIANT_ART_ACCOUNT),
(URL_PARSE_DEVIANT_ART_ACCOUNT_2, SocialSiteId.DEVIANT_ART_ACCOUNT),
] ]
# Hostnames of well-known Mastodon instances: every server that had
# 50 000+ users as of 6 July 2023, according to
# https://mastodonservers.net/servers/top
# One hostname per line; the literal is split on whitespace.
WELL_KNOWN_MASTODON_INSTANCES = frozenset('''
    mastodon.social
    pawoo.net
    baraag.net
    mstdn.jp
    mastodon.cloud
    mstdn.social
    mastodon.online
    mas.to
    mastodon.world
    mastodon.lol
    mastodon.sdf.org
    c.im
    mastodon.uno
    mastodonapp.uk
    fosstodon.org
'''.split())
def determine_social_from_url_internally(url): def determine_social_from_url_internally(url):
assert isinstance(url, str) assert isinstance(url, str)
@ -235,6 +270,9 @@ def determine_social_from_url_internally(url):
return (social_site_id, groups[0] if len(groups) > 0 else None) return (social_site_id, groups[0] if len(groups) > 0 else None)
# Mastodon # Mastodon
for mastodon_hostname in WELL_KNOWN_MASTODON_INSTANCES:
if url.startswith('https://' + mastodon_hostname):
return (SocialSiteId.MASTODON_PAGE, None)
if 'mastodon' in url: if 'mastodon' in url:
return (SocialSiteId.MASTODON_PAGE, None) return (SocialSiteId.MASTODON_PAGE, None)
@ -266,6 +304,8 @@ def run_tests():
assert determine_social_from_url('https://www.tiktok.com/@depthsofwikipedia?lang=en').social_id == 'depthsofwikipedia' assert determine_social_from_url('https://www.tiktok.com/@depthsofwikipedia?lang=en').social_id == 'depthsofwikipedia'
assert determine_social_from_url('https://www.pixiv.net/users/14866303').social_id == '14866303' assert determine_social_from_url('https://www.pixiv.net/users/14866303').social_id == '14866303'
assert determine_social_from_url('https://www.pixiv.net/member.php?id=109710').social_id == '109710' assert determine_social_from_url('https://www.pixiv.net/member.php?id=109710').social_id == '109710'
assert determine_social_from_url('https://www.deviantart.com/solquiet').social_site_id == SocialSiteId.DEVIANT_ART_ACCOUNT
assert determine_social_from_url('https://solquiet.deviantart.com/').social_site_id == SocialSiteId.DEVIANT_ART_ACCOUNT
INSTAGRAMS = [ INSTAGRAMS = [
'https://instagram.com/_richardparry_', 'https://instagram.com/_richardparry_',