diff --git a/socials_util/__init__.py b/socials_util/__init__.py
index 3b0316b..729c8ae 100644
--- a/socials_util/__init__.py
+++ b/socials_util/__init__.py
@@ -118,7 +118,7 @@ class WikidataInfo(object):
     nickname_version_of: Optional[SocialSiteId] = None
 
 
-WIKIDATA_PROPERTIES = {
+WIKIDATA_PROPERTIES: dict[SocialSiteId, WikidataInfo] = {
     SocialSiteId.EMAIL: WikidataInfo(968, None),
     SocialSiteId.RSS_FEED: WikidataInfo(1079, None),
     SocialSiteId.FACEBOOK_PAGE: WikidataInfo(2013, None),
@@ -184,8 +184,7 @@ WIKIDATA_PROPERTIES = {
 }
 
 
-def re_social_subdomain(main_domain):
-    # return r'^(?:https?:\/\/)?([\w_-]+)\.'+re.escape(main_domain)+'\/?$'
+def re_social_subdomain(main_domain: str) -> str:
     return r'^(?:https?:\/\/)?([\w_-]+)\.' + re.escape(main_domain) + r'(\/.*)?$'
 
 
@@ -194,23 +193,23 @@ RE_DUAL_ID = r'@?([^/]+/[^/]+)'
 RE_ANY_SUBPATH = r'(|\/|\/.*)$'
 
 
-def re_social_path(main_domain):
-    # return r'^(?:https?:\/\/)?(?:www\.)?'+re.escape(main_domain)+'\/'+RE_ID+'\/?$'
+def re_social_path(main_domain: str) -> str:
     return re_social_path_adv(main_domain, RE_ID)
 
 
-def re_social_path_adv(main_domain, *path):
-    assert not main_domain.startswith('www.'), 'Redundant www.'
-    l = [r'^', r'(?:https?:\/\/)?', r'(?:www\.)?', re.escape(main_domain)]
+def re_social_path_adv(main_domain: str, *path: str) -> str:
+    if main_domain.startswith('www.'):
+        msg = f'Redundant www: {main_domain}'
+        raise ValueError(msg)
+    regex_builder: list[str] = [r'^', r'(?:https?:\/\/)?', r'(?:www\.)?', re.escape(main_domain)]
 
     for p in path:
         if p != RE_ANY_SUBPATH:
-            l.append(r'\/')
-        l.append(p if p in {RE_ID, RE_DUAL_ID, RE_ANY_SUBPATH} else re.escape(p))
+            regex_builder.append(r'\/')
+        regex_builder.append(p if p in {RE_ID, RE_DUAL_ID, RE_ANY_SUBPATH} else re.escape(p))
     if path[-1] != RE_ANY_SUBPATH:
-        l.append(r'\/?$')
-    regex = ''.join(l)
-    return regex
+        regex_builder.append(r'\/?$')
+    return ''.join(regex_builder)
 
 
 MAILTO_URL = r'^mailto:(?:[\w._.]+@[\w._.]+)$'
@@ -264,7 +263,7 @@ URL_PARSE_DANBOORU_ARTIST = re_social_path_adv('danbooru.donmai.us', 'artists',
 URL_PARSE_BANDCAMP = re_social_subdomain('bandcamp.com')
 URL_PARSE_BLUESKY = re_social_path_adv('bsky.app', 'profile', RE_ID)
 
-REGEXES = [
+REGEXES: list[tuple[str, SocialSiteId]] = [
     # Reddit
     (REDDIT_SUBREDDIT_URL, SocialSiteId.REDDIT_SUBREDDIT),
     (REDDIT_USER_URL, SocialSiteId.REDDIT_USER),
@@ -364,7 +363,7 @@ REGEXES = [
     (re_social_subdomain('blogspot.com'), SocialSiteId.GOOGLE_BLOGGER_PAGE),
 ]
 
-WELL_KNOWN_MASTODON_INSTANCES = frozenset(
+WELL_KNOWN_MASTODON_INSTANCES: frozenset[str] = frozenset(
     {
         # Includes all servers with 50 000+ users as of 6 july 2023.
         # based on https://mastodonservers.net/servers/top
@@ -388,7 +387,7 @@ WELL_KNOWN_MASTODON_INSTANCES = frozenset(
 )
 
 
-def determine_social_from_url_internally(url: str):
+def determine_social_from_url_internally(url: str) -> tuple[SocialSiteId | None, str | None]:
     assert isinstance(url, str)
 
     # Regexes
@@ -405,15 +404,23 @@ def determine_social_from_url_internally(url: str):
         return (SocialSiteId.MASTODON_PAGE, None)
 
     # Feed (?)
-    elif 'feed' in url or 'xml' in url or 'rss' in url or 'atom' in url:
+    if 'feed' in url or 'xml' in url or 'rss' in url or 'atom' in url:
        return (SocialSiteId.RSS_FEED, None)
 
     return (None, None)
 
-
-def determine_social_from_url(url):
+def to_parse_result(url: str | urllib.parse.ParseResult) -> urllib.parse.ParseResult:
     if isinstance(url, str):
-        url = urllib.parse.urlparse(url)
+        return urllib.parse.urlparse(url)
+    if isinstance(url, urllib.parse.ParseResult):
+        return url
+
+    # Throw error
+    msg = f'Expected {urllib.parse.ParseResult} or {str}'
+    raise TypeError(msg)
+
+def determine_social_from_url(url_not_normalized: str | urllib.parse.ParseResult) -> SocialLink | None:
+    url = to_parse_result(url_not_normalized)
     (social_site_id, social_id) = determine_social_from_url_internally(
         url._replace(query='', fragment='').geturl()
     )
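A quick sketch of how the reworked `re_social_path_adv` behaves after this change. It assumes `re_social_path_adv` and `RE_ID` are importable from `socials_util` (both are defined in this file); the exact strings that match depend on the `RE_ID` pattern, which is defined elsewhere in the module, so the URLs here are illustrative:

```python
# Sketch only: exercises re_social_path_adv as changed in this patch.
import re

from socials_util import RE_ID, re_social_path_adv

# The assert became a ValueError, so the guard still fires under `python -O`
# (which strips assert statements).
try:
    re_social_path_adv('www.bsky.app', 'profile', RE_ID)
except ValueError as err:
    print(err)  # Redundant www: www.bsky.app

# Scheme and 'www.' remain optional, and a trailing slash is tolerated.
pattern = re_social_path_adv('bsky.app', 'profile', RE_ID)
print(bool(re.match(pattern, 'https://bsky.app/profile/example')))  # expected True
print(bool(re.match(pattern, 'bsky.app/profile/example/')))         # expected True
```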
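And a minimal usage sketch for the widened entry point: per this patch, `determine_social_from_url` now accepts either a string or a pre-parsed `urllib.parse.ParseResult`, strips query and fragment before matching, and rejects other types with a `TypeError` raised by the new `to_parse_result` helper. The URLs are made-up examples:

```python
import urllib.parse

from socials_util import determine_social_from_url

# Plain strings are parsed internally; query and fragment are ignored
# before the regexes run.
link = determine_social_from_url('https://bsky.app/profile/example?ref=home#top')
print(link)  # a SocialLink, or None if nothing matched

# Pre-parsed URLs are now accepted as-is.
parsed = urllib.parse.urlparse('https://mastodon.social/@example')
print(determine_social_from_url(parsed))

# Anything else is rejected by to_parse_result with a TypeError.
try:
    determine_social_from_url(42)  # type: ignore[arg-type]
except TypeError as err:
    print(err)
```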