From 46683720365a735feed9089e1b264ef53daec247 Mon Sep 17 00:00:00 2001 From: Jon Michael Aanes Date: Sat, 9 Nov 2024 17:53:48 +0100 Subject: [PATCH] Normalize URL --- socials_util/__init__.py | 340 ++++++++++++++++++++------------------- test/test_parsing.py | 5 + 2 files changed, 183 insertions(+), 162 deletions(-) diff --git a/socials_util/__init__.py b/socials_util/__init__.py index 593bfcc..34653b1 100644 --- a/socials_util/__init__.py +++ b/socials_util/__init__.py @@ -7,10 +7,10 @@ Used by one-page-internet. import re import urllib.parse -from dataclasses import dataclass +import dataclasses import aenum -from enforce_typing import enforce_types +import enforce_typing from socials_util._version import __version__ @@ -113,16 +113,16 @@ AGGERAGOR_SOCIALS = { } -@enforce_types -@dataclass(frozen=True) +@enforce_typing.enforce_types +@dataclasses.dataclass(frozen=True) class SocialLink: url: urllib.parse.ParseResult social_site_id: SocialSiteId social_id: str | None -@enforce_types -@dataclass(frozen=True) +@enforce_typing.enforce_types +@dataclasses.dataclass(frozen=True) class WikidataInfo: property_id: int | None issuer_id: int | None @@ -205,25 +205,22 @@ WIKIDATA_PROPERTIES: dict[SocialSiteId | int, WikidataInfo] = { } -def re_social_subdomain(main_domain: str) -> str: - return ( - r'^(?:https?:\/\/)?(?:www\.)?([\w_-]+)\.' + re.escape(main_domain) + r'(\/.*)?$' - ) - RE_ID = r'@?([^\s/]+)' RE_DUAL_ID = r'@?([^\s/]+/[^\s/]+)' RE_ANY_SUBPATH = r'(|\/|\/\S*)$' SPECIAL_REGEX_LITERALS = frozenset({RE_ID, RE_DUAL_ID, RE_ANY_SUBPATH}) +REGEX_LITERALS_TO_FORMATTER = {RE_ID: '{id}', RE_DUAL_ID: '{id}', RE_ANY_SUBPATH: ''} DOES_NOT_NEED_AUTO_SLASH = frozenset({RE_ANY_SUBPATH}) +@enforce_typing.enforce_types +@dataclasses.dataclass(frozen=True) +class SocialPathFormat: + regex: str + formatter: str -def re_social_path(main_domain: str) -> str: - return re_social_path_adv(main_domain, RE_ID) - - -def re_social_path_adv(main_domain: str, *path: str) -> str: +def social_path_format_adv(main_domain: str, *path: str) -> SocialPathFormat: if main_domain.startswith('www.'): msg = f'Redundant www: {main_domain}' raise ValueError(msg) @@ -233,17 +230,43 @@ def re_social_path_adv(main_domain: str, *path: str) -> str: r'(?:www\.|m\.|mobile\.)?', re.escape(main_domain), ] + formatter_builder = ['https://', main_domain] for p in path: if p not in DOES_NOT_NEED_AUTO_SLASH: regex_builder.append(r'\/') + formatter_builder.append('/') regex_builder.append( p if p in SPECIAL_REGEX_LITERALS else re.escape(p), ) + formatter_builder.append( + REGEX_LITERALS_TO_FORMATTER.get(p, p), + ) del p if path[-1] not in DOES_NOT_NEED_AUTO_SLASH: regex_builder.append(r'\/?$') - return ''.join(regex_builder) + formatter_builder.append('/') + + return SocialPathFormat( + regex = ''.join(regex_builder), + formatter = ''.join(formatter_builder), + ) + +def social_path_format(main_domain: str) -> SocialPathFormat: + return social_path_format_adv(main_domain, RE_ID) + +def re_social_subdomain(main_domain: str) -> SocialPathFormat: + return SocialPathFormat( + regex = r'^(?:https?:\/\/)?(?:www\.)?([\w_-]+)\.' + re.escape(main_domain) + r'(\/.*)?$', + formatter = 'https://{id}.' + main_domain, + ) + +def re_social_path(main_domain: str) -> str: + return social_path_format(main_domain).regex + + +def re_social_path_adv(main_domain: str, *path: str) -> str: + return social_path_format_adv(main_domain, *path).regex MAILTO_URL = r'^mailto:(?:[\w._.]+@[\w._.]+)$' @@ -252,171 +275,154 @@ REDDIT_SUBREDDIT_URL = r'^(?:https?:\/\/)?(?:old\.)?reddit\.com\/r\/([\w-]+)\/?$ REDDIT_USER_URL = ( r'^(?:https?:\/\/)?(?:old\.|www\.)?reddit\.com\/user\/([\w-]+)(?:|\/submitted)\/?$' ) -TWITTER_HANDLE_URL_1 = re_social_path_adv('twitter.com', RE_ID, RE_ANY_SUBPATH) -TWITTER_HANDLE_URL_2 = re_social_path_adv('x.com', RE_ID, RE_ANY_SUBPATH) -LINKTREE_PAGE_URL = re_social_path('linktr.ee') -TWITCH_STREAM_URL = re_social_path('twitch.tv') -WIKIDATA_ITEM_URL = re_social_path_adv('wikidata.org', 'wiki', RE_ID) + SONGKICK_ARTIST_URL = ( r'^(?:https?:\/\/)?(?:www\.)?songkick\.com\/artists\/(\d+)([\w-]*)\/?$' ) -TUMBLR_PAGE_URL = re_social_path('tumblr.com') -TUMBLR_PAGE_URL_2 = re_social_subdomain('tumblr.com') -TUMBLR_PAGE_URL_3 = re_social_path('tumblr.com/blog') -TUMBLR_PAGE_URL_4 = re_social_path('tumblr.com/blog/view') -INSTAGRAM_URL = re_social_path('instagram.com') -PATREON_URL = re_social_path_adv('patreon.com', RE_ID, RE_ANY_SUBPATH) -ARTSTATION_URL_1 = re_social_path_adv('artstation.com', RE_ID, RE_ANY_SUBPATH) -ARTSTATION_URL_2 = re_social_subdomain('artstation.com') -INPRNT_URL = re_social_path_adv('inprnt.com', 'gallery', RE_ID) -FACEBOOK_PAGE_URL = re_social_path('facebook.com') -SUBSTACK_PREFIX_URL = re_social_subdomain('substack.com') -ETSY_SHOP_URL = re_social_path_adv('etsy.com', 'shop', RE_ID) -BEHANCE_PAGE_URL = re_social_path('behance.net') -TIKTOK_USER_URL = re_social_path('tiktok.com') + PIXIV_USER_ID_URL = r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/users/(\d+)\/?$' PIXIV_USER_ID_URL_2 = ( r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/member\.php\/?[?]id=(\d+)$' ) -PIXIV_FANBOX_USER_NICKNAME_URL = re_social_subdomain('fanbox.cc') -PIXIV_USER_NICKNAME_URL = re_social_path_adv('pixiv.net', 'stacc', RE_ID) -PIXIV_SKETCH_USER_NICKNAME_URL = re_social_path_adv('sketch.pixiv.net', RE_ID) -URL_PARSE_CARRD_PAGE = re_social_subdomain('carrd.co') -URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1 = re_social_path_adv( - 'youtube.com', - RE_ID, - RE_ANY_SUBPATH, -) -URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2 = re_social_path_adv('youtube.com', 'c', RE_ID) -URL_PARSE_YOUTUBE_CHANNEL_ID = re_social_path_adv('youtube.com', 'channel', RE_ID) -URL_PARSE_VIMEO_CHANNEL = re_social_path_adv('vimeo.com', RE_ID) -URL_PARSE_NEWGROUNDS_PAGE = re_social_subdomain('newgrounds.com') -URL_PARSE_ARTSY_ARTIST = re_social_path_adv( - 'artsy.net', - 'artist', - RE_ID, - RE_ANY_SUBPATH, -) -URL_PARSE_ARTNET_ARTIST = re_social_path_adv( - 'artnet.com', - 'artists', - RE_ID, - RE_ANY_SUBPATH, -) -URL_PARSE_DEVIANT_ART_ACCOUNT = re_social_path_adv('deviantart.com', RE_ID) -URL_PARSE_DEVIANT_ART_ACCOUNT_2 = re_social_subdomain('deviantart.com') -URL_PARSE_DANBOORU_ARTIST = re_social_path_adv('danbooru.donmai.us', 'artists', RE_ID) -URL_PARSE_BANDCAMP = re_social_subdomain('bandcamp.com') -URL_PARSE_BLUESKY = re_social_path_adv('bsky.app', 'profile', RE_ID) +URL_FORMATS: list[tuple[object, SocialPathFormat]] = [ + + # Twitter + (SocialSiteId.TWITTER, social_path_format_adv('x.com', RE_ID, RE_ANY_SUBPATH)), + (SocialSiteId.TWITTER, social_path_format_adv('twitter.com', RE_ID, RE_ANY_SUBPATH)), + + # Linktr.ee + (SocialSiteId.LINKTREE_PAGE, social_path_format('linktr.ee')), + + # Twitch.tv + (SocialSiteId.TWITCH, social_path_format('twitch.tv')), + + # Wikidata + (SocialSiteId.WIKIDATA, social_path_format_adv('wikidata.org', 'wiki', RE_ID)), + + # Tumblr + (SocialSiteId.TUMBLR, social_path_format('tumblr.com')), + (SocialSiteId.TUMBLR, re_social_subdomain('tumblr.com')), + (SocialSiteId.TUMBLR, social_path_format('tumblr.com/blog')), + (SocialSiteId.TUMBLR, social_path_format('tumblr.com/blog/view')), + + # Instagram + (SocialSiteId.INSTAGRAM_PAGE, social_path_format('instagram.com')), + + # Patreon + (SocialSiteId.PATREON_PAGE, social_path_format_adv('patreon.com', RE_ID, RE_ANY_SUBPATH)), + + # Artstation + (SocialSiteId.ARTSTATION_PAGE, social_path_format_adv('artstation.com', RE_ID, RE_ANY_SUBPATH)), + (SocialSiteId.ARTSTATION_PAGE, re_social_subdomain('artstation.com')), + + # Inprnt + (SocialSiteId.INPRNT_PAGE, social_path_format_adv('inprnt.com', 'gallery', RE_ID)), + + # Facebook + (SocialSiteId.FACEBOOK_PAGE, social_path_format('facebook.com')), + + # Substack + (SocialSiteId.SUBSTACK, re_social_subdomain('substack.com')), + + # Etsy shop + (SocialSiteId.ETSY_SHOP, social_path_format_adv('etsy.com', 'shop', RE_ID)), + + # Behance + (SocialSiteId.BEHANCE_PAGE, social_path_format('behance.net')), + + # Tiktok + (SocialSiteId.TIKTOK_USER, social_path_format('tiktok.com')), + + # Pixiv + (SocialSiteId.PIXIV_USER_NICKNAME, social_path_format_adv('pixiv.net', 'stacc', RE_ID)), + (SocialSiteId.PIXIV_USER_NICKNAME, re_social_subdomain('fanbox.cc')), + (SocialSiteId.PIXIV_USER_NICKNAME, social_path_format_adv('sketch.pixiv.net', RE_ID)), + + # Carrd + (SocialSiteId.CARRD_PAGE, re_social_subdomain('carrd.co')), + + # Youtube + (SocialSiteId.YOUTUBE_CHANNEL_HANDLE, social_path_format_adv('youtube.com', RE_ID, RE_ANY_SUBPATH)), + (SocialSiteId.YOUTUBE_CHANNEL_HANDLE, social_path_format_adv('youtube.com', 'c', RE_ID)), + (SocialSiteId.YOUTUBE_CHANNEL_ID, social_path_format_adv('youtube.com', 'channel', RE_ID)), + + # Vimeo + (SocialSiteId.VIMEO_CHANNEL, social_path_format_adv('vimeo.com', RE_ID)), + + # Newgrounds + (SocialSiteId.NEWGROUNDS_PAGE, re_social_subdomain('newgrounds.com')), + + # Artsy + (SocialSiteId.ARTSY_ARTIST, social_path_format_adv( 'artsy.net', 'artist', RE_ID, RE_ANY_SUBPATH)), + + # Artnet + (SocialSiteId.ARTNET_ARTIST, social_path_format_adv( 'artnet.com', 'artists', RE_ID, RE_ANY_SUBPATH)), + + # Deviant art + (SocialSiteId.DEVIANT_ART_ACCOUNT, social_path_format_adv('deviantart.com', RE_ID)), + (SocialSiteId.DEVIANT_ART_ACCOUNT, re_social_subdomain('deviantart.com')), + + # Danbooru + (SocialSiteId.DANBOORU_ARTIST, social_path_format_adv('danbooru.donmai.us', 'artists', RE_ID)), + + # Bandcamp + (SocialSiteId.BANDCAMP_PROFILE, re_social_subdomain('bandcamp.com')), + + # Bluesky + (SocialSiteId.BLUESKY_PROFILE, social_path_format_adv('bsky.app', 'profile', RE_ID)), + + # Medium + (SocialSiteId.MEDIUM_BLOG, social_path_format_adv('medium.com', RE_ID), ), + (SocialSiteId.MEDIUM_BLOG, re_social_subdomain('medium.com'), ), + # Ko-fi + (SocialSiteId.KO_FI, social_path_format_adv('ko-fi.com', RE_ID), ), + (SocialSiteId.KO_FI, social_path_format_adv('ko-fi.com', RE_ID, 'shop'), ), + # Threads + (SocialSiteId.THREADS_USERNAME, social_path_format_adv('threads.net', RE_ID), ), + # Itch.io + (SocialSiteId.ITCH_IO_DEVELOPER, re_social_subdomain('itch.io'), ), + # Cohost + (SocialSiteId.COHOST_PROFILE, social_path_format_adv('cohost.org', RE_ID), ), + # Soundcloud + (SocialSiteId.SOUNDCLOUD_ARTIST, social_path_format_adv('soundcloud.com', RE_ID), ), + # IGDB + (SocialSiteId.IGDB_GAME_ID, social_path_format_adv('igdb.com', 'games', RE_ID), ), + # Steam game + ( + SocialSiteId.STEAM_APPLICATION_ID, + social_path_format_adv('store.steampowered.com', 'app', RE_ID, RE_ANY_SUBPATH), + ), + # Github + ( + SocialSiteId.GITHUB_REPOSITORY, + social_path_format_adv('github.com', RE_DUAL_ID, RE_ANY_SUBPATH), + ), + # Plurk + (SocialSiteId.PLURK, social_path_format_adv('plurk.com', RE_ID), ), + # Linked in + ( + SocialSiteId.LINKEDIN_PERSONAL_PROFILE, + social_path_format_adv('linkedin.com', 'in', RE_ID), + ), + # Google Blogger + (SocialSiteId.GOOGLE_BLOGGER_PAGE, re_social_subdomain('blogspot.com'), ), + # Cara + (SocialSiteId.CARA_PROFILE, social_path_format_adv('cara.app', RE_ID, RE_ANY_SUBPATH), ), +] REGEXES: list[tuple[str, object]] = [ # Reddit (REDDIT_SUBREDDIT_URL, SocialSiteId.REDDIT_SUBREDDIT), (REDDIT_USER_URL, SocialSiteId.REDDIT_USER), - # Twitter - (TWITTER_HANDLE_URL_1, SocialSiteId.TWITTER), - (TWITTER_HANDLE_URL_2, SocialSiteId.TWITTER), - # Facebook - (FACEBOOK_PAGE_URL, SocialSiteId.FACEBOOK_PAGE), - # Linktr.ee - (LINKTREE_PAGE_URL, SocialSiteId.LINKTREE_PAGE), - # Twitch.tv - (TWITCH_STREAM_URL, SocialSiteId.TWITCH), - # Wikidata - (WIKIDATA_ITEM_URL, SocialSiteId.WIKIDATA), # Songkick (SONGKICK_ARTIST_URL, SocialSiteId.SONGKICK_ARTIST), - # Tumblr - (TUMBLR_PAGE_URL, SocialSiteId.TUMBLR), - (TUMBLR_PAGE_URL_2, SocialSiteId.TUMBLR), - (TUMBLR_PAGE_URL_3, SocialSiteId.TUMBLR), - (TUMBLR_PAGE_URL_4, SocialSiteId.TUMBLR), - # Instagram - (INSTAGRAM_URL, SocialSiteId.INSTAGRAM_PAGE), - # Tiktok - (TIKTOK_USER_URL, SocialSiteId.TIKTOK_USER), # Pixiv (PIXIV_USER_ID_URL, SocialSiteId.PIXIV_USER_ID), (PIXIV_USER_ID_URL_2, SocialSiteId.PIXIV_USER_ID), - (PIXIV_FANBOX_USER_NICKNAME_URL, SocialSiteId.PIXIV_USER_NICKNAME), - (PIXIV_USER_NICKNAME_URL, SocialSiteId.PIXIV_USER_NICKNAME), - (PIXIV_SKETCH_USER_NICKNAME_URL, SocialSiteId.PIXIV_USER_NICKNAME), - # Patreon - (PATREON_URL, SocialSiteId.PATREON_PAGE), - # Artstation - (ARTSTATION_URL_1, SocialSiteId.ARTSTATION_PAGE), - (ARTSTATION_URL_2, SocialSiteId.ARTSTATION_PAGE), - # Inprnt - (INPRNT_URL, SocialSiteId.INPRNT_PAGE), # Email (MAILTO_URL, SocialSiteId.EMAIL), - # Substack - (SUBSTACK_PREFIX_URL, SocialSiteId.SUBSTACK), - # Medium - (re_social_path_adv('medium.com', RE_ID), SocialSiteId.MEDIUM_BLOG), - (re_social_subdomain('medium.com'), SocialSiteId.MEDIUM_BLOG), - # Etsy shop - (ETSY_SHOP_URL, SocialSiteId.ETSY_SHOP), - # Ko-fi - (re_social_path_adv('ko-fi.com', RE_ID), SocialSiteId.KO_FI), - (re_social_path_adv('ko-fi.com', RE_ID, 'shop'), SocialSiteId.KO_FI), - # Behance - (BEHANCE_PAGE_URL, SocialSiteId.BEHANCE_PAGE), - # Carrd - (URL_PARSE_CARRD_PAGE, SocialSiteId.CARRD_PAGE), - # Youtube - (URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1, SocialSiteId.YOUTUBE_CHANNEL_HANDLE), - (URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2, SocialSiteId.YOUTUBE_CHANNEL_HANDLE), - (URL_PARSE_YOUTUBE_CHANNEL_ID, SocialSiteId.YOUTUBE_CHANNEL_ID), - # Vimeo - (URL_PARSE_VIMEO_CHANNEL, SocialSiteId.VIMEO_CHANNEL), - # Newgrounds - (URL_PARSE_NEWGROUNDS_PAGE, SocialSiteId.NEWGROUNDS_PAGE), - # Artsy - (URL_PARSE_ARTSY_ARTIST, SocialSiteId.ARTSY_ARTIST), - (URL_PARSE_ARTNET_ARTIST, SocialSiteId.ARTNET_ARTIST), - # Deviant art - (URL_PARSE_DEVIANT_ART_ACCOUNT, SocialSiteId.DEVIANT_ART_ACCOUNT), - (URL_PARSE_DEVIANT_ART_ACCOUNT_2, SocialSiteId.DEVIANT_ART_ACCOUNT), - # Danbooru - (URL_PARSE_DANBOORU_ARTIST, SocialSiteId.DANBOORU_ARTIST), - # Bandcamp - (URL_PARSE_BANDCAMP, SocialSiteId.BANDCAMP_PROFILE), - # Bluesky - (URL_PARSE_BLUESKY, SocialSiteId.BLUESKY_PROFILE), - # Threads - (re_social_path_adv('threads.net', RE_ID), SocialSiteId.THREADS_USERNAME), - # Itch.io - (re_social_subdomain('itch.io'), SocialSiteId.ITCH_IO_DEVELOPER), - # Cohost - (re_social_path_adv('cohost.org', RE_ID), SocialSiteId.COHOST_PROFILE), - # Soundcloud - (re_social_path_adv('soundcloud.com', RE_ID), SocialSiteId.SOUNDCLOUD_ARTIST), - # IGDB - (re_social_path_adv('igdb.com', 'games', RE_ID), SocialSiteId.IGDB_GAME_ID), - # Steam game - ( - re_social_path_adv('store.steampowered.com', 'app', RE_ID, RE_ANY_SUBPATH), - SocialSiteId.STEAM_APPLICATION_ID, - ), - # Github - ( - re_social_path_adv('github.com', RE_DUAL_ID, RE_ANY_SUBPATH), - SocialSiteId.GITHUB_REPOSITORY, - ), - # Plurk - (re_social_path_adv('plurk.com', RE_ID), SocialSiteId.PLURK), - # Linked in - ( - re_social_path_adv('linkedin.com', 'in', RE_ID), - SocialSiteId.LINKEDIN_PERSONAL_PROFILE, - ), - # Google Blogger - (re_social_subdomain('blogspot.com'), SocialSiteId.GOOGLE_BLOGGER_PAGE), - # Cara - (re_social_path_adv('cara.app', RE_ID, RE_ANY_SUBPATH), SocialSiteId.CARA_PROFILE), -] +] + [(fmt.regex, social_site_id) for (social_site_id, fmt) in URL_FORMATS] WELL_KNOWN_MASTODON_INSTANCES: frozenset[str] = frozenset( { @@ -485,6 +491,11 @@ def to_parse_result(url: str | urllib.parse.ParseResult) -> urllib.parse.ParseRe msg = f'Expected {urllib.parse.ParseResult} or {str}' raise TypeError(msg) +def to_url(social_site_id: SocialSiteId, social_id: str) -> urllib.parse.ParseResult | None: + for (ssi, fmt) in URL_FORMATS: + if ssi == social_site_id: + return to_parse_result(fmt.formatter.format(id=social_id)) + return None def determine_social_from_url( url_not_normalized: str | urllib.parse.ParseResult, @@ -500,4 +511,9 @@ def determine_social_from_url( if not social_site_id: return None + + # Normalize url if possible + if social_id is not None: + url = to_url(social_site_id, social_id) or url + return SocialLink(url, social_site_id, social_id) diff --git a/test/test_parsing.py b/test/test_parsing.py index 3b2eace..b636015 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -242,3 +242,8 @@ def test_from_parse_result() -> None: def test_determine_social_from_url_internally() -> None: with pytest.raises(TypeError): assert socials_util.determine_social_from_url_internally(None) + +def test_normalize_url(): + social_link = determine_social_from_url('http://twitter.com/dril') + assert social_link is not None + assert social_link.url.geturl() == 'https://x.com/dril'