From 2bdd5b43fa524c2004a1ff2a37b72a32e73650db Mon Sep 17 00:00:00 2001 From: Jon Michael Aanes Date: Sat, 2 Sep 2023 18:29:56 +0200 Subject: [PATCH] Pixiv has too many link formats --- __init__.py | 68 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 17 deletions(-) diff --git a/__init__.py b/__init__.py index 1f13937..d017091 100644 --- a/__init__.py +++ b/__init__.py @@ -32,7 +32,8 @@ class SocialSiteId(enum.Enum): KO_FI = 20 BEHANCE_PAGE = 21 TIKTOK_USER = 7085 - PIXIV_USER = 23 + PIXIV_USER_ID = 5435 + PIXIV_USER_NICKNAME = 31 CARRD_PAGE = 24 HENTAI_FOUNDRY = 25 YOUTUBE_CHANNEL_HANDLE = 26 @@ -42,6 +43,8 @@ class SocialSiteId(enum.Enum): ARTSY_ARTIST = 2042 LINK_COLLECTION_PAGE = 29 DEVIANT_ART_ACCOUNT = 7737 + DANBOORU_ARTIST = 30 + BANDCAMP_PROFILE = 3283 def wikidata_property(self, client): return client.get(WIKIDATA_PROPERTIES[self]) @@ -49,7 +52,13 @@ class SocialSiteId(enum.Enum): def is_aggregator(self): return self in AGGERAGOR_SOCIALS -AGGERAGOR_SOCIALS = {SocialSiteId.LINKTREE_PAGE, SocialSiteId.WIKIDATA} +AGGERAGOR_SOCIALS = { + SocialSiteId.LINKTREE_PAGE, + SocialSiteId.WIKIDATA, + SocialSiteId.CARRD_PAGE, + SocialSiteId.LINK_COLLECTION_PAGE, + SocialSiteId.DANBOORU_ARTIST, +} @enforce_types @dataclass(frozen = True) @@ -63,6 +72,8 @@ class SocialLink(object): class WikidataInfo(object): property_id: Optional[int] issuer_id: Optional[int] + id_version_of: Optional[SocialSiteId] = None + nickname_version_of: Optional[SocialSiteId] = None WIKIDATA_PROPERTIES = { SocialSiteId.EMAIL: WikidataInfo(968, None), @@ -82,7 +93,9 @@ WIKIDATA_PROPERTIES = { SocialSiteId.TUMBLR: WikidataInfo(3943, None), SocialSiteId.TIKTOK_USER: WikidataInfo(7085, None), - SocialSiteId.PIXIV_USER: WikidataInfo(None, 306956), + SocialSiteId.PIXIV_USER_ID: WikidataInfo(5435, 306956, id_version_of = SocialSiteId.PIXIV_USER_NICKNAME), + SocialSiteId.PIXIV_USER_NICKNAME: WikidataInfo(None, 306956, nickname_version_of = SocialSiteId.PIXIV_USER_ID), + SocialSiteId.MASTODON_PAGE: WikidataInfo(4033, None), SocialSiteId.PATREON_PAGE: WikidataInfo(4175, 15861362), SocialSiteId.ARTSTATION_PAGE: WikidataInfo(None, 65551500), @@ -90,12 +103,14 @@ WIKIDATA_PROPERTIES = { SocialSiteId.CARRD_PAGE: WikidataInfo(None, 106036503), SocialSiteId.HENTAI_FOUNDRY: WikidataInfo(None, 115903301), - SocialSiteId.YOUTUBE_CHANNEL_HANDLE: WikidataInfo(11245, 866), - SocialSiteId.YOUTUBE_CHANNEL_ID: WikidataInfo(2397, None), + SocialSiteId.YOUTUBE_CHANNEL_HANDLE: WikidataInfo(11245, 866, nickname_version_of = SocialSiteId.YOUTUBE_CHANNEL_ID), + SocialSiteId.YOUTUBE_CHANNEL_ID: WikidataInfo(2397, 866, id_version_of = SocialSiteId.YOUTUBE_CHANNEL_HANDLE), SocialSiteId.VIMEO_CHANNEL: WikidataInfo(4015, 156376), SocialSiteId.NEWGROUNDS_PAGE: WikidataInfo(None, 263655), SocialSiteId.ARTSY_ARTIST: WikidataInfo(2042, 4796642), SocialSiteId.DEVIANT_ART_ACCOUNT: WikidataInfo(7737, None), + SocialSiteId.DANBOORU_ARTIST: WikidataInfo(None, 64514853), + SocialSiteId.BANDCAMP_PROFILE: WikidataInfo(3283, 545966), } def re_social_subdomain(main_domain): @@ -103,6 +118,7 @@ def re_social_subdomain(main_domain): return r'^(?:https?:\/\/)?([\w_-]+)\.'+re.escape(main_domain)+'(\/.*)?$' RE_ID = r'@?([^/]+)' +RE_ANY_SUBPATH = r'(|\/|\/.*)$' def re_social_path(main_domain): #return r'^(?:https?:\/\/)?(?:www\.)?'+re.escape(main_domain)+'\/'+RE_ID+'\/?$' @@ -113,10 +129,13 @@ def re_social_path_adv(main_domain, *path): l = [r'^', '(?:https?:\/\/)?', r'(?:www\.)?', re.escape(main_domain)] for p in path: - l.append(r'\/') - l.append(RE_ID if p == RE_ID else re.escape(p)) - l.append(r'\/?$') - return ''.join(l) + if p != RE_ANY_SUBPATH: + l.append(r'\/') + l.append(p if p in {RE_ID, RE_ANY_SUBPATH} else re.escape(p)) + if path[-1] != RE_ANY_SUBPATH: + l.append(r'\/?$') + regex = ''.join(l) + return regex MAILTO_URL = r'^mailto:(?:[\w._.]+@[\w._.]+)$' @@ -130,7 +149,7 @@ SONGKICK_ARTIST_URL = r'^(?:https?:\/\/)?(?:www\.)?songkick\.com\/artists\/(\d+) TUMBLR_PAGE_URL = re_social_path('tumblr.com') TUMBLR_PAGE_URL_2 = re_social_subdomain('tumblr.com') INSTAGRAM_URL = re_social_path('instagram.com') -PATREON_URL = re_social_path('patreon.com') +PATREON_URL = re_social_path_adv('patreon.com', RE_ID, RE_ANY_SUBPATH) ARTSTATION_URL = re_social_path('artstation.com') INPRNT_URL = re_social_path_adv('inprnt.com', 'gallery', RE_ID) FACEBOOK_PAGE_URL = re_social_path('facebook.com') @@ -138,11 +157,14 @@ SUBSTACK_PREFIX_URL = re_social_subdomain('substack.com') ETSY_SHOP_URL = re_social_path_adv('etsy.com', 'shop', RE_ID) BEHANCE_PAGE_URL = re_social_path('behance.net') TIKTOK_USER_URL = re_social_path('tiktok.com') -PIXIV_USER_URL = r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/users/(\d+)\/?$' -PIXIV_USER_URL_2 = r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/member\.php\/?[?]id=(\d+)$' +PIXIV_USER_ID_URL = r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/users/(\d+)\/?$' +PIXIV_USER_ID_URL_2 = r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/member\.php\/?[?]id=(\d+)$' +PIXIV_FANBOX_USER_NICKNAME_URL = re_social_subdomain('fanbox.cc') +PIXIV_USER_NICKNAME_URL = re_social_path_adv('pixiv.net', 'stacc', RE_ID) +PIXIV_SKETCH_USER_NICKNAME_URL = re_social_path_adv('sketch.pixiv.net', RE_ID) URL_PARSE_CARRD_PAGE = re_social_subdomain('carrd.co') -URL_PARSE_HENTAI_FOUNDRY= re_social_path_adv('hentai-foundry.com', 'user', RE_ID, 'profile') +URL_PARSE_HENTAI_FOUNDRY = re_social_path_adv('hentai-foundry.com', 'user', RE_ID, RE_ANY_SUBPATH) URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1= re_social_path_adv('youtube.com', RE_ID) URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2= re_social_path_adv('youtube.com', 'c', RE_ID) URL_PARSE_YOUTUBE_CHANNEL_ID= re_social_path_adv('youtube.com', 'channel', RE_ID) @@ -151,6 +173,9 @@ URL_PARSE_NEWGROUNDS_PAGE = re_social_subdomain('newgrounds.com') URL_PARSE_ARTSY_ARTIST = re_social_path_adv('artsy.net', 'artist', RE_ID) URL_PARSE_DEVIANT_ART_ACCOUNT = re_social_path_adv('deviantart.com', RE_ID) URL_PARSE_DEVIANT_ART_ACCOUNT_2 = re_social_subdomain('deviantart.com') +URL_PARSE_DANBOORU_ARTIST = re_social_path_adv('danbooru.donmai.us', 'artists', RE_ID) +URL_PARSE_BANDCAMP = re_social_subdomain('bandcamp.com') + # TODO: https://.deviantart.com REGEXES = [ @@ -187,8 +212,11 @@ REGEXES = [ (TIKTOK_USER_URL, SocialSiteId.TIKTOK_USER), # Pixiv - (PIXIV_USER_URL, SocialSiteId.PIXIV_USER), - (PIXIV_USER_URL_2, SocialSiteId.PIXIV_USER), + (PIXIV_USER_ID_URL, SocialSiteId.PIXIV_USER_ID), + (PIXIV_USER_ID_URL_2, SocialSiteId.PIXIV_USER_ID), + (PIXIV_FANBOX_USER_NICKNAME_URL , SocialSiteId.PIXIV_USER_NICKNAME), + (PIXIV_USER_NICKNAME_URL , SocialSiteId.PIXIV_USER_NICKNAME), + (PIXIV_SKETCH_USER_NICKNAME_URL, SocialSiteId.PIXIV_USER_NICKNAME), # Patreon (PATREON_URL, SocialSiteId.PATREON_PAGE), @@ -238,6 +266,12 @@ REGEXES = [ # Deviant art (URL_PARSE_DEVIANT_ART_ACCOUNT, SocialSiteId.DEVIANT_ART_ACCOUNT), (URL_PARSE_DEVIANT_ART_ACCOUNT_2, SocialSiteId.DEVIANT_ART_ACCOUNT), + + # Danbooru + (URL_PARSE_DANBOORU_ARTIST, SocialSiteId.DANBOORU_ARTIST), + + # Bandcamp + (URL_PARSE_BANDCAMP, SocialSiteId.BANDCAMP_PROFILE), ] WELL_KNOWN_MASTODON_INSTANCES = frozenset({ @@ -260,12 +294,12 @@ WELL_KNOWN_MASTODON_INSTANCES = frozenset({ 'fosstodon.org', }) -def determine_social_from_url_internally(url): +def determine_social_from_url_internally(url: str): assert isinstance(url, str) # Regexes for (social_site_url_regex, social_site_id) in REGEXES: - if m := re.match(social_site_url_regex, url, re.I): + if m := re.fullmatch(social_site_url_regex, url, re.I): groups = m.groups() return (social_site_id, groups[0] if len(groups) > 0 else None)