"""Classify URLs as links to known social sites and extract their identifiers.

The public entry point is :func:`determine_social_from_url`, which returns a
:class:`SocialLink` for recognised URLs and ``None`` otherwise.
"""

from dataclasses import dataclass
from typing import List, Set, Optional, Tuple, Union
from enum import Enum
import datetime
import re
import urllib.parse

try:
    from enforce_typing import enforce_types
except ImportError:
    # enforce_typing only adds runtime type validation; fall back to a no-op
    # decorator so the module stays importable when it is not installed.
    def enforce_types(target):
        """Return *target* unchanged (stand-in for enforce_typing)."""
        return target


class SocialSiteId(Enum):
    """Identifier for each kind of site/link the parser can classify."""

    REDDIT = 1
    TWITTER = 2
    RSS_FEED = 3
    PAGE_WATCH = 4
    LINKTREE_PAGE = 5
    TWITCH = 6
    WIKIDATA = 7
    SONGKICK_ARTIST = 8
    TUMBLR = 9
    MASTODON_PAGE = 10
    INSTAGRAM_PAGE = 11
    PATREON_PAGE = 12
    ARTSTATION_PAGE = 13
    INPRNT_PAGE = 14
    FACEBOOK_PAGE = 15
    EMAIL = 16


@enforce_types
@dataclass(frozen=True)
class SocialLink:
    """A classified URL.

    ``social_id`` is the identifier captured from the URL (first regex group),
    or ``None`` when the site was recognised by a substring heuristic only.
    """

    url: urllib.parse.ParseResult
    social_site_id: SocialSiteId
    social_id: Optional[str]


# Each pattern captures the site-specific identifier in group 1.
# Accepts the bare host as well as the www./old./new. hosts.
REDDIT_SUBSCRIPTION_URL = r'^(?:https?:\/\/)?(?:(?:www|old|new)\.)?reddit\.com\/r\/(\w+)\/?$'
TWITTER_HANDLE_URL = r'^(?:https?:\/\/)?(?:www\.)?twitter\.com\/(\w+)\/?$'
LINKTREE_PAGE_URL = r'^(?:https?:\/\/)?(?:www\.)?linktr\.ee\/(\w+)\/?$'
TWITCH_STREAM_URL = r'^(?:https?:\/\/)?(?:www\.)?twitch\.tv\/(\w+)\/?$'
WIKIDATA_ITEM_URL = r'^(?:https?:\/\/)?(?:www\.)?wikidata\.org\/wiki\/(\w+)\/?$'
SONGKICK_ARTIST_URL = r'^(?:https?:\/\/)?(?:www\.)?songkick\.com\/artists\/(\d+)([\w-]*)\/?$'
TUMBLR_PAGE_URL = r'^(?:https?:\/\/)?(?:www\.)?tumblr\.com\/([\w-]+)(?:\/|\/rss)?\/?$'
# Blog-on-subdomain form. The lookahead keeps "www.tumblr.com" from being
# reported as a blog named "www"; hyphens are allowed in the subdomain.
TUMBLR_PAGE_URL_2 = r'^(?:https?:\/\/)?(?!www\.)([\w-]+)\.tumblr\.com\/?$'
# Periods are accepted so dotted usernames (e.g. "some.user") match.
INSTAGRAM_URL = r'^(?:https?:\/\/)?(?:www\.)?instagram\.com\/([\w.-]+)\/?$'
PATREON_URL = r'^(?:https?:\/\/)?(?:www\.)?patreon\.com\/([\w-]+)\/?$'
ARTSTATION_URL = r'^(?:https?:\/\/)?(?:www\.)?artstation\.com\/([\w-]+)\/?$'
INPRNT_URL = r'^(?:https?:\/\/)?(?:www\.)?inprnt\.com\/gallery\/([\w-]+)\/?$'
# Local part may contain "+" and "-"; the host part may contain "-".
MAILTO_URL = r'^mailto:([\w.+-]+@[\w.-]+)$'
FACEBOOK_PAGE_URL = r'^(?:https?:\/\/)?(?:www\.)?facebook\.com\/([\w-]+)\/?$'

# Patterns are tried in order; the first match wins.
REGEXES: List[Tuple[str, SocialSiteId]] = [
    # Subreddits
    (REDDIT_SUBSCRIPTION_URL, SocialSiteId.REDDIT),

    # Twitter
    (TWITTER_HANDLE_URL, SocialSiteId.TWITTER),

    # Facebook
    (FACEBOOK_PAGE_URL, SocialSiteId.FACEBOOK_PAGE),

    # Linktr.ee
    (LINKTREE_PAGE_URL, SocialSiteId.LINKTREE_PAGE),

    # Twitch.tv
    (TWITCH_STREAM_URL, SocialSiteId.TWITCH),

    # Wikidata
    (WIKIDATA_ITEM_URL, SocialSiteId.WIKIDATA),

    # Songkick
    (SONGKICK_ARTIST_URL, SocialSiteId.SONGKICK_ARTIST),

    # Tumblr
    (TUMBLR_PAGE_URL, SocialSiteId.TUMBLR),
    (TUMBLR_PAGE_URL_2, SocialSiteId.TUMBLR),

    # Instagram
    (INSTAGRAM_URL, SocialSiteId.INSTAGRAM_PAGE),

    # Patreon
    (PATREON_URL, SocialSiteId.PATREON_PAGE),

    # Artstation
    (ARTSTATION_URL, SocialSiteId.ARTSTATION_PAGE),

    # Inprnt
    (INPRNT_URL, SocialSiteId.INPRNT_PAGE),

    # Email
    (MAILTO_URL, SocialSiteId.EMAIL),
]


def determine_social_from_url_internally(
    url: str,
) -> Tuple[Optional[SocialSiteId], Optional[str]]:
    """Classify *url* and return ``(social_site_id, social_id)``.

    The patterns in ``REGEXES`` are tried first, in order. If none matches,
    two substring heuristics are applied (Mastodon, then feeds); these yield
    a site id with ``social_id`` set to ``None``. Unrecognised URLs produce
    ``(None, None)``.
    """
    # REGEXES is read on every call so entries added at runtime are honoured;
    # the re module caches the compiled patterns.
    for social_site_url_regex, social_site_id in REGEXES:
        if m := re.match(social_site_url_regex, url):
            return (social_site_id, m.group(1))

    # Mastodon: instances live on arbitrary hosts, so only a loose substring
    # check is possible here.
    if 'mastodon' in url:
        return (SocialSiteId.MASTODON_PAGE, None)

    # Feed (?): loose heuristic, matches these substrings anywhere in the URL.
    if 'feed' in url or 'xml' in url or 'rss' in url or 'atom' in url:
        return (SocialSiteId.RSS_FEED, None)

    return (None, None)


def determine_social_from_url(url: str) -> Optional[SocialLink]:
    """Return a :class:`SocialLink` describing *url*, or ``None`` if unknown.

    The returned link carries the ``urllib.parse.urlparse`` result for *url*
    together with the detected site id and (possibly ``None``) identifier.
    """
    parsed_url = urllib.parse.urlparse(url)
    (social_site_id, social_id) = determine_social_from_url_internally(url)

    if social_site_id is None:
        return None
    return SocialLink(parsed_url, social_site_id, social_id)


# Import-time smoke tests kept from the original module.
assert determine_social_from_url('http://www.twitter.com/dril').social_id == 'dril'
assert determine_social_from_url('http://worstdril.tumblr.com/')