diff --git a/socials_util/__init__.py b/socials_util/__init__.py index 243fc42..d6fff24 100644 --- a/socials_util/__init__.py +++ b/socials_util/__init__.py @@ -238,6 +238,8 @@ SONGKICK_ARTIST_URL = ( ) TUMBLR_PAGE_URL = re_social_path('tumblr.com') TUMBLR_PAGE_URL_2 = re_social_subdomain('tumblr.com') +TUMBLR_PAGE_URL_3 = re_social_path('tumblr.com/blog') +TUMBLR_PAGE_URL_4 = re_social_path('tumblr.com/blog/view') INSTAGRAM_URL = re_social_path('instagram.com') PATREON_URL = re_social_path_adv('patreon.com', RE_ID, RE_ANY_SUBPATH) ARTSTATION_URL = re_social_path('artstation.com') @@ -299,6 +301,8 @@ REGEXES: list[tuple[str, SocialSiteId]] = [ # Tumblr (TUMBLR_PAGE_URL, SocialSiteId.TUMBLR), (TUMBLR_PAGE_URL_2, SocialSiteId.TUMBLR), + (TUMBLR_PAGE_URL_3, SocialSiteId.TUMBLR), + (TUMBLR_PAGE_URL_4, SocialSiteId.TUMBLR), # Instagram (INSTAGRAM_URL, SocialSiteId.INSTAGRAM_PAGE), # Tiktok @@ -402,6 +406,7 @@ WELL_KNOWN_MASTODON_INSTANCES: frozenset[str] = frozenset( }, ) +DISALLOWED_IDENTIFIERS: frozenset[str] = frozenset({'www'}) def determine_social_from_url_internally( url: str, @@ -414,7 +419,11 @@ def determine_social_from_url_internally( for social_site_url_regex, social_site_id in REGEXES: if m := re.fullmatch(social_site_url_regex, url, re.I): groups = m.groups() - return (social_site_id, groups[0] if len(groups) > 0 else None) + username_or_id = groups[0] if len(groups) > 0 else None + if username_or_id in DISALLOWED_IDENTIFIERS: + continue + return (social_site_id, username_or_id) + del social_site_url_regex, social_site_id, m # Mastodon for mastodon_hostname in WELL_KNOWN_MASTODON_INSTANCES: diff --git a/test/test_parsing.py b/test/test_parsing.py index 349e7ab..b9a575d 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -2,6 +2,17 @@ import pytest from socials_util import * +PARSABLE_SOCIAL_IDS_COMBINED = [ + # Tumblr formats + ('https://triviallytrue.tumblr.com/', SocialSiteId.TUMBLR, 'triviallytrue'), + ('https://tumblr.com/triviallytrue', SocialSiteId.TUMBLR, 'triviallytrue'), + ('https://tumblr.com/blog/triviallytrue', SocialSiteId.TUMBLR, 'triviallytrue'), + ('https://tumblr.com/blog/view/triviallytrue', SocialSiteId.TUMBLR, 'triviallytrue'), + ('https://www.tumblr.com/triviallytrue', SocialSiteId.TUMBLR, 'triviallytrue'), + ('https://www.tumblr.com/blog/triviallytrue', SocialSiteId.TUMBLR, 'triviallytrue'), + ('https://www.tumblr.com/blog/view/triviallytrue', SocialSiteId.TUMBLR, 'triviallytrue'), +] + PARSABLE_SOCIAL_IDS = [ ('http://www.twitter.com/dril', 'dril'), ('http://worstdril.tumblr.com/', 'worstdril'), @@ -13,7 +24,7 @@ PARSABLE_SOCIAL_IDS = [ ('https://www.tiktok.com/@depthsofwikipedia?lang=en', 'depthsofwikipedia'), ('https://www.pixiv.net/users/14866303', '14866303'), ('https://www.pixiv.net/member.php?id=109710', '109710'), -] +] + [(a,c) for (a,b,c) in PARSABLE_SOCIAL_IDS_COMBINED] PARSABLE_SOCIAL_SITE_IDS = [ ('https://www.deviantart.com/solquiet', SocialSiteId.DEVIANT_ART_ACCOUNT), @@ -25,16 +36,15 @@ PARSABLE_SOCIAL_SITE_IDS = [ ('https://www.instagram.com/timkongart/', SocialSiteId.INSTAGRAM_PAGE), ('https://www.instagram.com/kcn.wu/', SocialSiteId.INSTAGRAM_PAGE), ('https://www.instagram.com/itsbettyjiang', SocialSiteId.INSTAGRAM_PAGE), -] - +] + [(a,b) for (a,b,c) in PARSABLE_SOCIAL_IDS_COMBINED] @pytest.mark.parametrize('url,expected_social_id', PARSABLE_SOCIAL_IDS) def test_parse_social_ids(url, expected_social_id): social_link = determine_social_from_url(url) - assert social_link.social_id == expected_social_id - assert social_link.social_site_id is not None + assert social_link.social_id == expected_social_id, url + assert social_link.social_site_id is not None, url @pytest.mark.parametrize('url,expected_social_site_id', PARSABLE_SOCIAL_SITE_IDS) def test_parse_social_site_ids(url, expected_social_site_id): - assert determine_social_from_url(url).social_site_id == expected_social_site_id + assert determine_social_from_url(url).social_site_id == expected_social_site_id, url