From bdf60219fdba7cf9ab57ccdaabc203cdf74826d7 Mon Sep 17 00:00:00 2001 From: Jon Michael Aanes Date: Sun, 6 Aug 2023 00:35:03 +0200 Subject: [PATCH] Additional mastodon pages and deviant art parsing. --- __init__.py | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/__init__.py b/__init__.py index 642ab7c..1f13937 100644 --- a/__init__.py +++ b/__init__.py @@ -36,9 +36,12 @@ class SocialSiteId(enum.Enum): CARRD_PAGE = 24 HENTAI_FOUNDRY = 25 YOUTUBE_CHANNEL_HANDLE = 26 + YOUTUBE_CHANNEL_ID = 2397 VIMEO_CHANNEL = 27 NEWGROUNDS_PAGE = 28 ARTSY_ARTIST = 2042 + LINK_COLLECTION_PAGE = 29 + DEVIANT_ART_ACCOUNT = 7737 def wikidata_property(self, client): return client.get(WIKIDATA_PROPERTIES[self]) @@ -80,7 +83,7 @@ WIKIDATA_PROPERTIES = { SocialSiteId.TIKTOK_USER: WikidataInfo(7085, None), SocialSiteId.PIXIV_USER: WikidataInfo(None, 306956), - #SocialSiteId.MASTODON_PAGE: 2000 + 10, + SocialSiteId.MASTODON_PAGE: WikidataInfo(4033, None), SocialSiteId.PATREON_PAGE: WikidataInfo(4175, 15861362), SocialSiteId.ARTSTATION_PAGE: WikidataInfo(None, 65551500), #SocialSiteId.INPRNT_PAGE: WikidataInfo(None, None), @@ -88,9 +91,11 @@ WIKIDATA_PROPERTIES = { SocialSiteId.CARRD_PAGE: WikidataInfo(None, 106036503), SocialSiteId.HENTAI_FOUNDRY: WikidataInfo(None, 115903301), SocialSiteId.YOUTUBE_CHANNEL_HANDLE: WikidataInfo(11245, 866), + SocialSiteId.YOUTUBE_CHANNEL_ID: WikidataInfo(2397, None), SocialSiteId.VIMEO_CHANNEL: WikidataInfo(4015, 156376), SocialSiteId.NEWGROUNDS_PAGE: WikidataInfo(None, 263655), SocialSiteId.ARTSY_ARTIST: WikidataInfo(2042, 4796642), + SocialSiteId.DEVIANT_ART_ACCOUNT: WikidataInfo(7737, None), } def re_social_subdomain(main_domain): @@ -104,6 +109,7 @@ def re_social_path(main_domain): return re_social_path_adv(main_domain, RE_ID) def re_social_path_adv(main_domain, *path): + assert not main_domain.startswith('www.'), 'Redundant www.' 
l = [r'^', '(?:https?:\/\/)?', r'(?:www\.)?', re.escape(main_domain)] for p in path: @@ -139,9 +145,13 @@ URL_PARSE_CARRD_PAGE = re_social_subdomain('carrd.co') URL_PARSE_HENTAI_FOUNDRY= re_social_path_adv('hentai-foundry.com', 'user', RE_ID, 'profile') URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1= re_social_path_adv('youtube.com', RE_ID) URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2= re_social_path_adv('youtube.com', 'c', RE_ID) +URL_PARSE_YOUTUBE_CHANNEL_ID= re_social_path_adv('youtube.com', 'channel', RE_ID) URL_PARSE_VIMEO_CHANNEL= re_social_path_adv('vimeo.com', RE_ID) URL_PARSE_NEWGROUNDS_PAGE = re_social_subdomain('newgrounds.com') URL_PARSE_ARTSY_ARTIST = re_social_path_adv('artsy.net', 'artist', RE_ID) +URL_PARSE_DEVIANT_ART_ACCOUNT = re_social_path_adv('deviantart.com', RE_ID) +URL_PARSE_DEVIANT_ART_ACCOUNT_2 = re_social_subdomain('deviantart.com') +# TODO: https://<username>.deviantart.com REGEXES = [ # Reddit @@ -214,6 +224,7 @@ REGEXES = [ # Youtube (URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1, SocialSiteId.YOUTUBE_CHANNEL_HANDLE), (URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2, SocialSiteId.YOUTUBE_CHANNEL_HANDLE), + (URL_PARSE_YOUTUBE_CHANNEL_ID, SocialSiteId.YOUTUBE_CHANNEL_ID), # Vimeo (URL_PARSE_VIMEO_CHANNEL, SocialSiteId.VIMEO_CHANNEL), @@ -223,8 +234,32 @@ REGEXES = [ # Artsy (URL_PARSE_ARTSY_ARTIST, SocialSiteId.ARTSY_ARTIST), + + # DeviantArt + (URL_PARSE_DEVIANT_ART_ACCOUNT, SocialSiteId.DEVIANT_ART_ACCOUNT), + (URL_PARSE_DEVIANT_ART_ACCOUNT_2, SocialSiteId.DEVIANT_ART_ACCOUNT), ] +WELL_KNOWN_MASTODON_INSTANCES = frozenset({ + # Includes all servers with 50 000+ users as of 6 July 2023. 
+ # based on https://mastodonservers.net/servers/top + 'mastodon.social', + 'pawoo.net', + 'baraag.net', + 'mstdn.jp', + 'mastodon.cloud', + 'mstdn.social', + 'mastodon.online', + 'mas.to', + 'mastodon.world', + 'mastodon.lol', + 'mastodon.sdf.org', + 'c.im', + 'mastodon.uno', + 'mastodonapp.uk', + 'fosstodon.org', +}) + def determine_social_from_url_internally(url): assert isinstance(url, str) @@ -235,6 +270,9 @@ def determine_social_from_url_internally(url): return (social_site_id, groups[0] if len(groups) > 0 else None) # Mastodon + for mastodon_hostname in WELL_KNOWN_MASTODON_INSTANCES: + if url.startswith('https://' + mastodon_hostname): + return (SocialSiteId.MASTODON_PAGE, None) if 'mastodon' in url: return (SocialSiteId.MASTODON_PAGE, None) @@ -266,6 +304,8 @@ def run_tests(): assert determine_social_from_url('https://www.tiktok.com/@depthsofwikipedia?lang=en').social_id == 'depthsofwikipedia' assert determine_social_from_url('https://www.pixiv.net/users/14866303').social_id == '14866303' assert determine_social_from_url('https://www.pixiv.net/member.php?id=109710').social_id == '109710' + assert determine_social_from_url('https://www.deviantart.com/solquiet').social_site_id == SocialSiteId.DEVIANT_ART_ACCOUNT + assert determine_social_from_url('https://solquiet.deviantart.com/').social_site_id == SocialSiteId.DEVIANT_ART_ACCOUNT INSTAGRAMS = [ 'https://instagram.com/_richardparry_',