1
0

More typing

This commit is contained in:
Jon Michael Aanes 2024-05-12 16:34:47 +02:00
parent f73ba5ccc2
commit 1aa41a8414
Signed by: Jmaa
SSH Key Fingerprint: SHA256:Ab0GfHGCblESJx7JRE4fj4bFy/KRpeLhi41y4pF3sNA

View File

@ -118,7 +118,7 @@ class WikidataInfo(object):
nickname_version_of: Optional[SocialSiteId] = None nickname_version_of: Optional[SocialSiteId] = None
WIKIDATA_PROPERTIES = { WIKIDATA_PROPERTIES: dict[SocialSiteId, WikidataInfo] = {
SocialSiteId.EMAIL: WikidataInfo(968, None), SocialSiteId.EMAIL: WikidataInfo(968, None),
SocialSiteId.RSS_FEED: WikidataInfo(1079, None), SocialSiteId.RSS_FEED: WikidataInfo(1079, None),
SocialSiteId.FACEBOOK_PAGE: WikidataInfo(2013, None), SocialSiteId.FACEBOOK_PAGE: WikidataInfo(2013, None),
@ -184,8 +184,7 @@ WIKIDATA_PROPERTIES = {
} }
def re_social_subdomain(main_domain: str) -> str:
    """Build a regex source string for sites that identify users by subdomain.

    The resulting pattern accepts an optional ``http://`` / ``https://``
    scheme, captures the subdomain label directly in front of *main_domain*
    as group 1, and tolerates any trailing path (captured as group 2).

    Args:
        main_domain: Literal domain name (e.g. ``'bandcamp.com'``); it is
            regex-escaped before being embedded in the pattern.

    Returns:
        The uncompiled regular expression as a string.
    """
    return r'^(?:https?:\/\/)?([\w_-]+)\.' + re.escape(main_domain) + r'(\/.*)?$'
@ -194,23 +193,23 @@ RE_DUAL_ID = r'@?([^/]+/[^/]+)'
# Regex fragment matching the rest of a URL path: nothing at all, a lone
# slash, or a slash followed by anything. It carries its own end-of-string
# anchor, so patterns ending in this fragment need no further `$`.
RE_ANY_SUBPATH = r'(|\/|\/.*)$'
def re_social_path(main_domain: str) -> str:
    """Build a regex source string for sites that identify users by first path segment.

    Thin convenience wrapper: delegates to ``re_social_path_adv`` with a
    single ``RE_ID`` path segment.

    Args:
        main_domain: Literal domain name without a leading ``www.``.

    Returns:
        The uncompiled regular expression as a string.
    """
    return re_social_path_adv(main_domain, RE_ID)
def re_social_path_adv(main_domain: str, *path: str) -> str:
    """Build a regex source string matching ``main_domain`` followed by *path*.

    The pattern accepts an optional ``http://`` / ``https://`` scheme and an
    optional ``www.`` prefix, then the escaped domain, then every path
    segment in order.

    Args:
        main_domain: Literal domain name. Must not start with ``www.``,
            because the optional ``www.`` prefix is already part of the
            generated pattern.
        *path: Path segments. ``RE_ID``, ``RE_DUAL_ID`` and
            ``RE_ANY_SUBPATH`` are inserted verbatim as regex fragments;
            every other segment is treated as literal text and escaped.

    Returns:
        The uncompiled regular expression as a string.

    Raises:
        ValueError: If *main_domain* starts with ``www.``.
    """
    if main_domain.startswith('www.'):
        msg = f'Redundant www: {main_domain}'
        raise ValueError(msg)

    # Segments that are already regex fragments and must not be escaped.
    regex_fragments = {RE_ID, RE_DUAL_ID, RE_ANY_SUBPATH}

    regex_builder: list[str] = [
        r'^',
        r'(?:https?:\/\/)?',
        r'(?:www\.)?',
        re.escape(main_domain),
    ]
    for p in path:
        # RE_ANY_SUBPATH matches its own leading slash (or none at all).
        if p != RE_ANY_SUBPATH:
            regex_builder.append(r'\/')
        regex_builder.append(p if p in regex_fragments else re.escape(p))

    # RE_ANY_SUBPATH already ends in `$`; otherwise close the pattern here.
    # The `not path` guard keeps a call without segments from failing with
    # IndexError on `path[-1]`.
    if not path or path[-1] != RE_ANY_SUBPATH:
        regex_builder.append(r'\/?$')
    return ''.join(regex_builder)
# Matches a bare `mailto:` URL. Hyphens are allowed on both sides of the `@`
# (they are legal in domain names) and `+` is allowed in the local part, so
# addresses such as `jane+tag@my-site.org` are recognised.
MAILTO_URL = r'^mailto:(?:[\w.+-]+@[\w.-]+)$'
@ -264,7 +263,7 @@ URL_PARSE_DANBOORU_ARTIST = re_social_path_adv('danbooru.donmai.us', 'artists',
# Bandcamp: the identifier is the subdomain in front of bandcamp.com.
URL_PARSE_BANDCAMP = re_social_subdomain('bandcamp.com')
# Bluesky: the identifier is the path segment following /profile/.
URL_PARSE_BLUESKY = re_social_path_adv('bsky.app', 'profile', RE_ID)
REGEXES = [ REGEXES: list[tuple[str, SocialSiteId]] = [
# Reddit # Reddit
(REDDIT_SUBREDDIT_URL, SocialSiteId.REDDIT_SUBREDDIT), (REDDIT_SUBREDDIT_URL, SocialSiteId.REDDIT_SUBREDDIT),
(REDDIT_USER_URL, SocialSiteId.REDDIT_USER), (REDDIT_USER_URL, SocialSiteId.REDDIT_USER),
@ -364,7 +363,7 @@ REGEXES = [
(re_social_subdomain('blogspot.com'), SocialSiteId.GOOGLE_BLOGGER_PAGE), (re_social_subdomain('blogspot.com'), SocialSiteId.GOOGLE_BLOGGER_PAGE),
] ]
WELL_KNOWN_MASTODON_INSTANCES = frozenset( WELL_KNOWN_MASTODON_INSTANCES: frozenset[str] = frozenset(
{ {
# Includes all servers with 50 000+ users as of 6 july 2023. # Includes all servers with 50 000+ users as of 6 july 2023.
# based on https://mastodonservers.net/servers/top # based on https://mastodonservers.net/servers/top
@ -388,7 +387,7 @@ WELL_KNOWN_MASTODON_INSTANCES = frozenset(
) )
def determine_social_from_url_internally(url: str): def determine_social_from_url_internally(url: str) -> tuple[SocialSiteId | None, str | None]:
assert isinstance(url, str) assert isinstance(url, str)
# Regexes # Regexes
@ -405,15 +404,23 @@ def determine_social_from_url_internally(url: str):
return (SocialSiteId.MASTODON_PAGE, None) return (SocialSiteId.MASTODON_PAGE, None)
# Feed (?) # Feed (?)
elif 'feed' in url or 'xml' in url or 'rss' in url or 'atom' in url: if 'feed' in url or 'xml' in url or 'rss' in url or 'atom' in url:
return (SocialSiteId.RSS_FEED, None) return (SocialSiteId.RSS_FEED, None)
return (None, None) return (None, None)
def to_parse_result(url: str | urllib.parse.ParseResult) -> urllib.parse.ParseResult:
def determine_social_from_url(url):
if isinstance(url, str): if isinstance(url, str):
url = urllib.parse.urlparse(url) return urllib.parse.urlparse(url)
if isinstance(url, urllib.parse.ParseResult):
return url
# Throw error
msg = f'Expected {urllib.parse.ParseResult} or {str}'
raise TypeError(msg)
def determine_social_from_url(url_not_normalized: str | urllib.parse.ParseResult) -> SocialLink | None:
url = to_parse_result(url_not_normalized)
(social_site_id, social_id) = determine_social_from_url_internally( (social_site_id, social_id) = determine_social_from_url_internally(
url._replace(query='', fragment='').geturl() url._replace(query='', fragment='').geturl()
) )