From b57fd28034d041f8024e54e12207f058ef609d30 Mon Sep 17 00:00:00 2001 From: Jon Michael Aanes Date: Sun, 21 May 2023 22:07:17 +0200 Subject: [PATCH] Additional socials --- __init__.py | 177 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 139 insertions(+), 38 deletions(-) diff --git a/__init__.py b/__init__.py index 587df60..5707aa1 100644 --- a/__init__.py +++ b/__init__.py @@ -31,10 +31,22 @@ class SocialSiteId(Enum): ETSY_SHOP = 19 KO_FI = 20 BEHANCE_PAGE = 21 + TIKTOK_USER = 22 + PIXIV_USER = 23 + CARRD_PAGE = 24 + HENTAI_FOUNDRY = 25 + YOUTUBE_CHANNEL_HANDLE = 26 + VIMEO_CHANNEL = 27 + NEWGROUNDS_PAGE = 28 def wikidata_property(self, client): return client.get(WIKIDATA_PROPERTIES[self]) + def is_aggregator(self): + return self in AGGERAGOR_SOCIALS + +AGGERAGOR_SOCIALS = {SocialSiteId.LINKTREE_PAGE, SocialSiteId.WIKIDATA} + @enforce_types @dataclass(frozen = True) class SocialLink(object): @@ -42,45 +54,89 @@ class SocialLink(object): social_site_id: SocialSiteId social_id: Optional[str] -WIKIDATA_PROPERTIES = { - SocialSiteId.EMAIL: 968, - SocialSiteId.FACEBOOK_PAGE: 2013, - SocialSiteId.INSTAGRAM_PAGE: 2003, - SocialSiteId.LINKTREE_PAGE: 11079, - SocialSiteId.REDDIT_SUBREDDIT: 3984, - SocialSiteId.REDDIT_USER: 4265, - SocialSiteId.RSS_FEED: 1019, - SocialSiteId.SONGKICK_ARTIST: 3478, - SocialSiteId.TWITCH: 5797, - SocialSiteId.TWITTER: 2002, - SocialSiteId.WIKIDATA: 43649390, +@enforce_types +@dataclass(frozen = True) +class WikidataInfo(object): + property_id: Optional[int] + issuer_id: Optional[int] - SocialSiteId.TUMBLR: 3943, +WIKIDATA_PROPERTIES = { + SocialSiteId.EMAIL: WikidataInfo(968, None), + SocialSiteId.FACEBOOK_PAGE: WikidataInfo(2013, None), + SocialSiteId.INSTAGRAM_PAGE: WikidataInfo(2003, None), + SocialSiteId.LINKTREE_PAGE: WikidataInfo(11079, None), + SocialSiteId.REDDIT_SUBREDDIT: WikidataInfo(3984, None), + SocialSiteId.REDDIT_USER: WikidataInfo(4265, None), + SocialSiteId.RSS_FEED: WikidataInfo(1019, None), + SocialSiteId.SONGKICK_ARTIST: WikidataInfo(3478, None), + SocialSiteId.TWITCH: WikidataInfo(5797, None), + SocialSiteId.TWITTER: WikidataInfo(2002, None), + SocialSiteId.WIKIDATA: WikidataInfo(43649390, None), + + SocialSiteId.TUMBLR: WikidataInfo(3943, None), + SocialSiteId.TIKTOK_USER: WikidataInfo(7085, None), + + SocialSiteId.PIXIV_USER: WikidataInfo(None, 306956), #SocialSiteId.MASTODON_PAGE: 2000 + 10, - #SocialSiteId.PATREON_PAGE: 2000 + 12, - #SocialSiteId.ARTSTATION_PAGE: 2000 + 13, - #SocialSiteId.INPRNT_PAGE: 2000 + 14, + SocialSiteId.PATREON_PAGE: WikidataInfo(4175, 15861362), + SocialSiteId.ARTSTATION_PAGE: WikidataInfo(None, 65551500), + #SocialSiteId.INPRNT_PAGE: WikidataInfo(None, None), + + SocialSiteId.CARRD_PAGE: WikidataInfo(None, 106036503), + SocialSiteId.HENTAI_FOUNDRY: WikidataInfo(None, 115903301), + SocialSiteId.YOUTUBE_CHANNEL_HANDLE: WikidataInfo(11245, 866), + SocialSiteId.VIMEO_CHANNEL: WikidataInfo(4015, 156376), + SocialSiteId.NEWGROUNDS_PAGE: WikidataInfo(None, 263655), } -REDDIT_SUBREDDIT_URL = r'^(?:https?:\/\/)?(?:old\.)?reddit\.com\/r\/(\w+)\/?$' -REDDIT_USER_URL = r'^(?:https?:\/\/)?(?:old\.)?reddit\.com\/user\/(\w+)(?:|\/submitted)\/?$' -TWITTER_HANDLE_URL = r'^(?:https?:\/\/)?(?:www\.)?twitter\.com\/(\w+)\/?$' -LINKTREE_PAGE_URL = r'^(?:https?:\/\/)?(?:www\.)?linktr\.ee\/(\w+)\/?$' -TWITCH_STREAM_URL = r'^(?:https?:\/\/)?(?:www\.)?twitch\.tv\/(\w+)\/?$' -WIKIDATA_ITEM_URL = r'^(?:https?:\/\/)?(?:www\.)?wikidata\.org\/wiki\/(\w+)\/?$' -SONGKICK_ARTIST_URL = r'^(?:https?:\/\/)?(?:www\.)?songkick\.com\/artists\/(\d+)([\w-]*)\/?$' -TUMBLR_PAGE_URL = r'^(?:https?:\/\/)?(?:www\.)?tumblr\.com\/([\w-]+)(?:\/|\/rss)?\/?$' -TUMBLR_PAGE_URL_2 = r'^(?:https?:\/\/)?([\w-]+)\.tumblr\.com\/?$' -INSTAGRAM_URL = r'^(?:https?:\/\/)?(?:www\.)?instagram\.com\/([\w_.-]+)\/?$' -PATREON_URL = r'^(?:https?:\/\/)?(?:www\.)?patreon\.com\/([\w-]+)\/?$' -ARTSTATION_URL = r'^(?:https?:\/\/)?(?:www\.)?artstation\.com\/([\w-]+)\/?$' -INPRNT_URL = r'^(?:https?:\/\/)?(?:www\.)?inprnt\.com\/gallery\/([\w-]+)\/?$' +def re_social_subdomain(main_domain): + #return r'^(?:https?:\/\/)?([\w_-]+)\.'+re.escape(main_domain)+'\/?$' + return r'^(?:https?:\/\/)?([\w_-]+)\.'+re.escape(main_domain)+'(\/.*)?$' + +RE_ID = r'@?([\w_.-]+)' + +def re_social_path(main_domain): + #return r'^(?:https?:\/\/)?(?:www\.)?'+re.escape(main_domain)+'\/'+RE_ID+'\/?$' + return re_social_path_adv(main_domain, RE_ID) + +def re_social_path_adv(main_domain, *path): + l = [r'^', '(?:https?:\/\/)?', r'(?:www\.)?', re.escape(main_domain)] + + for p in path: + l.append(r'\/') + l.append(RE_ID if p == RE_ID else re.escape(p)) + l.append(r'\/?$') + return ''.join(l) + MAILTO_URL = r'^mailto:(?:[\w._.]+@[\w._.]+)$' -FACEBOOK_PAGE_URL = r'^(?:https?:\/\/)?(?:www\.)?facebook\.com\/([\w-]+)\/?$' -SUBSTACK_PREFIX_URL = r'^(?:https?:\/\/)?(\w+)\.substack\.com\/?$' -ETSY_SHOP_URL = r'^(?:https?:\/\/)?(?:www\.)?etsy\.com\/shop\/([\w-]+)\/?$' -KO_FI_URL = r'^(?:https?:\/\/)?(?:www\.)?ko\-fi\.com\/([\w-]+)(?:|\/shop)\/?$' -BEHANCE_PAGE_URL = r'^(?:https?:\/\/)?(?:www\.)?behance\.net\/([\w-]+)\/?$' + +REDDIT_SUBREDDIT_URL = r'^(?:https?:\/\/)?(?:old\.)?reddit\.com\/r\/([\w-]+)\/?$' +REDDIT_USER_URL = r'^(?:https?:\/\/)?(?:old\.)?reddit\.com\/user\/([\w-]+)(?:|\/submitted)\/?$' +TWITTER_HANDLE_URL =re_social_path('twitter.com') +LINKTREE_PAGE_URL = re_social_path('linktr.ee') +TWITCH_STREAM_URL = re_social_path('twitch.tv') +WIKIDATA_ITEM_URL = re_social_path_adv('wikidata.org', 'wiki', RE_ID) +SONGKICK_ARTIST_URL = r'^(?:https?:\/\/)?(?:www\.)?songkick\.com\/artists\/(\d+)([\w-]*)\/?$' +TUMBLR_PAGE_URL = re_social_path('tumblr.com') +TUMBLR_PAGE_URL_2 = re_social_subdomain('tumblr.com') +INSTAGRAM_URL = re_social_path('instagram.com') +PATREON_URL = re_social_path('patreon.com') +ARTSTATION_URL = re_social_path('artstation.com') +INPRNT_URL = re_social_path_adv('inprnt.com', 'gallery', RE_ID) +FACEBOOK_PAGE_URL = re_social_path('facebook.com') +SUBSTACK_PREFIX_URL = re_social_subdomain('substack.com') +ETSY_SHOP_URL = re_social_path_adv('etsy.com', 'shop', RE_ID) +BEHANCE_PAGE_URL = re_social_path('behance.net') +TIKTOK_USER_URL = re_social_path('tiktok.com') +PIXIV_USER_URL = r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/users/(\d+)\/?$' +PIXIV_USER_URL_2 = r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/member\.php\/?[?]id=(\d+)$' + +URL_PARSE_CARRD_PAGE = re_social_subdomain('carrd.co') +URL_PARSE_HENTAI_FOUNDRY= re_social_path_adv('hentai-foundry.com', 'user', RE_ID, 'profile') +URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1= re_social_path_adv('youtube.com', RE_ID) +URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2= re_social_path_adv('youtube.com', 'c', RE_ID) +URL_PARSE_VIMEO_CHANNEL= re_social_path_adv('vimeo.com', RE_ID) +URL_PARSE_NEWGROUNDS_PAGE=re_social_subdomain('newgrounds.com') REGEXES = [ # Reddit @@ -112,6 +168,13 @@ REGEXES = [ # Instagram (INSTAGRAM_URL, SocialSiteId.INSTAGRAM_PAGE), + # Tiktok + (TIKTOK_USER_URL, SocialSiteId.TIKTOK_USER), + + # Pixiv + (PIXIV_USER_URL, SocialSiteId.PIXIV_USER), + (PIXIV_USER_URL_2, SocialSiteId.PIXIV_USER), + # Patreon (PATREON_URL, SocialSiteId.PATREON_PAGE), @@ -131,10 +194,27 @@ REGEXES = [ (ETSY_SHOP_URL, SocialSiteId.ETSY_SHOP), # Ko-fi - (KO_FI_URL, SocialSiteId.KO_FI), + (re_social_path_adv('ko-fi.com', RE_ID), SocialSiteId.KO_FI), + (re_social_path_adv('ko-fi.com', RE_ID, 'shop'), SocialSiteId.KO_FI), # Behance (BEHANCE_PAGE_URL, SocialSiteId.BEHANCE_PAGE), + + # Carrd + (URL_PARSE_CARRD_PAGE, SocialSiteId.CARRD_PAGE), + + # Hentai-Foundry + (URL_PARSE_HENTAI_FOUNDRY, SocialSiteId.HENTAI_FOUNDRY), + + # Youtube + (URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1, SocialSiteId.YOUTUBE_CHANNEL_HANDLE), + (URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2, SocialSiteId.YOUTUBE_CHANNEL_HANDLE), + + # Vimeo + (URL_PARSE_VIMEO_CHANNEL, SocialSiteId.VIMEO_CHANNEL), + + # Newgrounds + (URL_PARSE_NEWGROUNDS_PAGE, SocialSiteId.NEWGROUNDS_PAGE), ] def determine_social_from_url_internally(url): @@ -158,17 +238,38 @@ def determine_social_from_url_internally(url): def determine_social_from_url(url): if isinstance(url, str): url = urllib.parse.urlparse(url) - (social_site_id, social_id) = determine_social_from_url_internally(url.geturl()) + (social_site_id, social_id) = determine_social_from_url_internally(url._replace(query = '', fragment = '').geturl()) + if not social_site_id: + (social_site_id, social_id) = determine_social_from_url_internally(url._replace(fragment = '').geturl()) if not social_site_id: return None return SocialLink(url, social_site_id, social_id) -TEST = True -if TEST: +def run_tests(): assert determine_social_from_url('http://www.twitter.com/dril').social_id == 'dril' assert determine_social_from_url('http://worstdril.tumblr.com/') assert determine_social_from_url('https://deep-dark-fears.tumblr.com').social_id == 'deep-dark-fears' assert determine_social_from_url('https://www.etsy.com/shop/aleksiremesart').social_id == 'aleksiremesart' assert determine_social_from_url('https://ko-fi.com/A627LI1/shop').social_id == 'A627LI1' assert determine_social_from_url('https://ko-fi.com/A627LI1/').social_id == 'A627LI1' + assert determine_social_from_url('https://www.facebook.com/fredagscafeen.dk/').social_id == 'fredagscafeen.dk' + assert determine_social_from_url('https://www.tiktok.com/@depthsofwikipedia?lang=en').social_id == 'depthsofwikipedia' + assert determine_social_from_url('https://www.pixiv.net/users/14866303').social_id == '14866303' + assert determine_social_from_url('https://www.pixiv.net/member.php?id=109710').social_id == '109710' + + INSTAGRAMS = [ + 'https://instagram.com/_richardparry_', + 'https://instagram.com/j_kmor/', + 'https://instagram.com/cullensartbox/', + 'https://www.instagram.com/timkongart/', + 'https://www.instagram.com/kcn.wu/', + 'https://www.instagram.com/itsbettyjiang', + ] + + for ig in INSTAGRAMS: + assert determine_social_from_url(ig).social_site_id == SocialSiteId.INSTAGRAM_PAGE + +TEST = True +if TEST: + run_tests()