From 9b7e4680bf7bab68a99f4ddfd8f0e98b480ed5bd Mon Sep 17 00:00:00 2001
From: Jon Michael Aanes
Date: Mon, 2 Jan 2023 23:47:03 +0100
Subject: [PATCH] Additional social sites

---
 __init__.py | 42 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/__init__.py b/__init__.py
index 10d7938..bc76607 100644
--- a/__init__.py
+++ b/__init__.py
@@ -23,6 +23,11 @@ class SocialSiteId(Enum):
     INPRNT_PAGE = 14
     FACEBOOK_PAGE = 15
     EMAIL = 16
+    JSON_LD = 17 # Similar to PAGE_WATCH, but focused on embedded microdata
+    SUBSTACK = 18
+
+    def wikidata_property(self, client):
+        return client.get(WIKIDATA_PROPERTIES[self])
 
 @enforce_types
 @dataclass(frozen = True)
@@ -31,6 +36,25 @@ class SocialLink(object):
     social_site_id: SocialSiteId
     social_id: Optional[str]
 
+WIKIDATA_PROPERTIES = {
+    SocialSiteId.EMAIL: 968,
+    SocialSiteId.FACEBOOK_PAGE: 2013,
+    SocialSiteId.INSTAGRAM_PAGE: 2003,
+    SocialSiteId.LINKTREE_PAGE: 11079,
+    SocialSiteId.REDDIT: 3984,
+    SocialSiteId.RSS_FEED: 1019,
+    SocialSiteId.SONGKICK_ARTIST: 3478,
+    SocialSiteId.TWITCH: 5797,
+    SocialSiteId.TWITTER: 2002,
+    SocialSiteId.WIKIDATA: 43649390,
+
+    #SocialSiteId.TUMBLR: 2000 + 9,
+    #SocialSiteId.MASTODON_PAGE: 2000 + 10,
+    #SocialSiteId.PATREON_PAGE: 2000 + 12,
+    #SocialSiteId.ARTSTATION_PAGE: 2000 + 13,
+    #SocialSiteId.INPRNT_PAGE: 2000 + 14,
+}
+
 REDDIT_SUBSCRIPTION_URL = r'^(?:https?:\/\/)?(?:old\.)?reddit\.com\/r\/(\w+)\/?$'
 TWITTER_HANDLE_URL = r'^(?:https?:\/\/)?(?:www\.)?twitter\.com\/(\w+)\/?$'
 LINKTREE_PAGE_URL = r'^(?:https?:\/\/)?(?:www\.)?linktr\.ee\/(\w+)\/?$'
@@ -39,12 +63,13 @@ WIKIDATA_ITEM_URL = r'^(?:https?:\/\/)?(?:www\.)?wikidata\.org\/wiki\/(\w+)\/?$'
 SONGKICK_ARTIST_URL = r'^(?:https?:\/\/)?(?:www\.)?songkick\.com\/artists\/(\d+)([\w-]*)\/?$'
 TUMBLR_PAGE_URL = r'^(?:https?:\/\/)?(?:www\.)?tumblr\.com\/([\w-]+)(?:\/|\/rss)?\/?$'
 TUMBLR_PAGE_URL_2 = r'^(?:https?:\/\/)?(\w+)\.tumblr\.com\/?$'
-INSTAGRAM_URL = r'^(?:https?:\/\/)?(?:www\.)?instagram\.com\/([\w_-]+)\/?$'
+INSTAGRAM_URL = r'^(?:https?:\/\/)?(?:www\.)?instagram\.com\/([\w_.-]+)\/?$'
 PATREON_URL = r'^(?:https?:\/\/)?(?:www\.)?patreon\.com\/([\w-]+)\/?$'
 ARTSTATION_URL = r'^(?:https?:\/\/)?(?:www\.)?artstation\.com\/([\w-]+)\/?$'
 INPRNT_URL = r'^(?:https?:\/\/)?(?:www\.)?inprnt\.com\/gallery\/([\w-]+)\/?$'
-MAILTO_URL = r'^mailto:([\w._.]+@[\w._.]+)$'
+MAILTO_URL = r'^mailto:(?:[\w._.]+@[\w._.]+)$'
 FACEBOOK_PAGE_URL = r'^(?:https?:\/\/)?(?:www\.)?facebook\.com\/([\w-]+)\/?$'
+SUBSTACK_PREFIX_URL = r'^(?:https?:\/\/)?(\w+)\.substack\.com\/?$'
 
 REGEXES = [
     # Subreddits
@@ -86,6 +111,9 @@ REGEXES = [
 
     # Email
     (MAILTO_URL, SocialSiteId.EMAIL),
+
+    # Substack
+    (SUBSTACK_PREFIX_URL, SocialSiteId.SUBSTACK),
 ]
 
 def determine_social_from_url_internally(url):
@@ -93,7 +121,8 @@ def determine_social_from_url_internally(url):
     # Regexes
     for (social_site_url_regex, social_site_id) in REGEXES:
         if m := re.match(social_site_url_regex, url):
-            return (social_site_id, m.group(1))
+            groups = m.groups()
+            return (social_site_id, groups[0] if len(groups) > 0 else None)
 
     # Mastodon
     if 'mastodon' in url:
@@ -106,12 +135,13 @@ def determine_social_from_url_internally(url):
     return (None, None)
 
 def determine_social_from_url(url):
-    parsed_url = urllib.parse.urlparse(url)
-    (social_site_id, social_id) = determine_social_from_url_internally(url)
+    if isinstance(url, str):
+        url = urllib.parse.urlparse(url)
+    (social_site_id, social_id) = determine_social_from_url_internally(url.geturl())
 
     if not social_site_id:
         return None
-    return SocialLink(parsed_url, social_site_id, social_id)
+    return SocialLink(url, social_site_id, social_id)
 
 assert determine_social_from_url('http://www.twitter.com/dril').social_id == 'dril'
 assert determine_social_from_url('http://worstdril.tumblr.com/')