diff --git a/socials_util/__init__.py b/socials_util/__init__.py index c15b7ac..7488fe9 100644 --- a/socials_util/__init__.py +++ b/socials_util/__init__.py @@ -188,12 +188,15 @@ WIKIDATA_PROPERTIES: dict[SocialSiteId | int, WikidataInfo] = { def re_social_subdomain(main_domain: str) -> str: - return r'^(?:https?:\/\/)?([\w_-]+)\.' + re.escape(main_domain) + r'(\/.*)?$' + return r'^(?:https?:\/\/)?(?:www\.)?([\w_-]+)\.' + re.escape(main_domain) + r'(\/.*)?$' -RE_ID = r'@?([^/]+)' -RE_DUAL_ID = r'@?([^/]+/[^/]+)' -RE_ANY_SUBPATH = r'(|\/|\/.*)$' +RE_ID = r'@?([^\s/]+)' +RE_DUAL_ID = r'@?([^\s/]+/[^\s/]+)' +RE_ANY_SUBPATH = r'(|\/|\/\S*)$' + +SPECIAL_REGEX_LITERALS = frozenset({RE_ID, RE_DUAL_ID, RE_ANY_SUBPATH}) +DOES_NOT_NEED_AUTO_SLASH = frozenset({RE_ANY_SUBPATH}) def re_social_path(main_domain: str) -> str: @@ -212,12 +215,13 @@ def re_social_path_adv(main_domain: str, *path: str) -> str: ] for p in path: - if p != RE_ANY_SUBPATH: + if p not in DOES_NOT_NEED_AUTO_SLASH: regex_builder.append(r'\/') regex_builder.append( - p if p in {RE_ID, RE_DUAL_ID, RE_ANY_SUBPATH} else re.escape(p), + p if p in SPECIAL_REGEX_LITERALS else re.escape(p), ) - if path[-1] != RE_ANY_SUBPATH: + del p + if path[-1] not in DOES_NOT_NEED_AUTO_SLASH: regex_builder.append(r'\/?$') return ''.join(regex_builder) @@ -413,8 +417,7 @@ WELL_KNOWN_MASTODON_INSTANCES: frozenset[str] = frozenset( }, ) -DISALLOWED_IDENTIFIERS: frozenset[str] = frozenset({'www'}) - +DISALLOWED_IDENTIFIERS: frozenset[str] = frozenset({'www', 'intent', 'user'}) def determine_social_from_url_internally( url: str, diff --git a/test/test_parsing.py b/test/test_parsing.py index 6cd3f74..076ec96 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -1,10 +1,11 @@ import pytest -from socials_util import * +from socials_util import determine_social_from_url, SocialSiteId, SocialLink PARSABLE_SOCIAL_IDS_COMBINED: list[tuple[str, object, str]] = [ # Tumblr formats ('https://triviallytrue.tumblr.com/', SocialSiteId.TUMBLR, 'triviallytrue'), + ('https://www.triviallytrue.tumblr.com/', SocialSiteId.TUMBLR, 'triviallytrue'), ('https://tumblr.com/triviallytrue', SocialSiteId.TUMBLR, 'triviallytrue'), ('https://tumblr.com/blog/triviallytrue', SocialSiteId.TUMBLR, 'triviallytrue'), ( @@ -179,6 +180,13 @@ PARSABLE_SOCIAL_IDS_COMBINED: list[tuple[str, object, str]] = [ ('https://solquiet.deviantart.com/', SocialSiteId.DEVIANT_ART_ACCOUNT, 'solquiet'), ] +NOT_PARSABLE = [ + # Twitter intents are not supported + 'twitter.com/intent/user?user_id=123', + 'https://twitter.com/intent/user?user_id=123', + 'https://twitter.com/intent/user', + 'https://twitter.com/intent', +] @pytest.mark.parametrize( 'url,expected_social_site_id,expected_social_id', PARSABLE_SOCIAL_IDS_COMBINED @@ -190,3 +198,7 @@ def test_parse_social_ids(url, expected_social_site_id, expected_social_id): expected_social_id, expected_social_site_id, ), url + +@pytest.mark.parametrize('url', NOT_PARSABLE) +def test_not_parsable(url: str): + assert determine_social_from_url(url) is None