1
0

More precise tests, and better support for certain sites.
Some checks failed
Python Package / Python-Test (push) Successful in 20s
Python Package / Python-Package (push) Failing after 21s

This commit is contained in:
Jon Michael Aanes 2024-05-26 11:24:56 +02:00
parent f4a71899e0
commit eead04f620
Signed by: Jmaa
SSH Key Fingerprint: SHA256:Ab0GfHGCblESJx7JRE4fj4bFy/KRpeLhi41y4pF3sNA
2 changed files with 165 additions and 54 deletions

View File

@@ -113,7 +113,7 @@ class WikidataInfo:
nickname_version_of: SocialSiteId | None = None nickname_version_of: SocialSiteId | None = None
WIKIDATA_PROPERTIES: dict[SocialSiteId, WikidataInfo] = { WIKIDATA_PROPERTIES: dict[SocialSiteId | int, WikidataInfo] = {
SocialSiteId.EMAIL: WikidataInfo(968, None), SocialSiteId.EMAIL: WikidataInfo(968, None),
SocialSiteId.RSS_FEED: WikidataInfo(1079, None), SocialSiteId.RSS_FEED: WikidataInfo(1079, None),
SocialSiteId.FACEBOOK_PAGE: WikidataInfo(2013, None), SocialSiteId.FACEBOOK_PAGE: WikidataInfo(2013, None),
@@ -207,7 +207,7 @@ def re_social_path_adv(main_domain: str, *path: str) -> str:
regex_builder: list[str] = [ regex_builder: list[str] = [
r'^', r'^',
r'(?:https?:\/\/)?', r'(?:https?:\/\/)?',
r'(?:www\.)?', r'(?:www\.|m\.|mobile\.)?',
re.escape(main_domain), re.escape(main_domain),
] ]
@@ -228,8 +228,8 @@ REDDIT_SUBREDDIT_URL = r'^(?:https?:\/\/)?(?:old\.)?reddit\.com\/r\/([\w-]+)\/?$
REDDIT_USER_URL = ( REDDIT_USER_URL = (
r'^(?:https?:\/\/)?(?:old\.)?reddit\.com\/user\/([\w-]+)(?:|\/submitted)\/?$' r'^(?:https?:\/\/)?(?:old\.)?reddit\.com\/user\/([\w-]+)(?:|\/submitted)\/?$'
) )
TWITTER_HANDLE_URL_1 = re_social_path('twitter.com') TWITTER_HANDLE_URL_1 = re_social_path_adv('twitter.com', RE_ID, RE_ANY_SUBPATH)
TWITTER_HANDLE_URL_2 = re_social_path('x.com') TWITTER_HANDLE_URL_2 = re_social_path_adv('x.com', RE_ID, RE_ANY_SUBPATH)
LINKTREE_PAGE_URL = re_social_path('linktr.ee') LINKTREE_PAGE_URL = re_social_path('linktr.ee')
TWITCH_STREAM_URL = re_social_path('twitch.tv') TWITCH_STREAM_URL = re_social_path('twitch.tv')
WIKIDATA_ITEM_URL = re_social_path_adv('wikidata.org', 'wiki', RE_ID) WIKIDATA_ITEM_URL = re_social_path_adv('wikidata.org', 'wiki', RE_ID)
@@ -242,7 +242,8 @@ TUMBLR_PAGE_URL_3 = re_social_path('tumblr.com/blog')
TUMBLR_PAGE_URL_4 = re_social_path('tumblr.com/blog/view') TUMBLR_PAGE_URL_4 = re_social_path('tumblr.com/blog/view')
INSTAGRAM_URL = re_social_path('instagram.com') INSTAGRAM_URL = re_social_path('instagram.com')
PATREON_URL = re_social_path_adv('patreon.com', RE_ID, RE_ANY_SUBPATH) PATREON_URL = re_social_path_adv('patreon.com', RE_ID, RE_ANY_SUBPATH)
ARTSTATION_URL = re_social_path('artstation.com') ARTSTATION_URL_1 = re_social_path_adv('artstation.com', RE_ID, RE_ANY_SUBPATH)
ARTSTATION_URL_2 = re_social_subdomain('artstation.com')
INPRNT_URL = re_social_path_adv('inprnt.com', 'gallery', RE_ID) INPRNT_URL = re_social_path_adv('inprnt.com', 'gallery', RE_ID)
FACEBOOK_PAGE_URL = re_social_path('facebook.com') FACEBOOK_PAGE_URL = re_social_path('facebook.com')
SUBSTACK_PREFIX_URL = re_social_subdomain('substack.com') SUBSTACK_PREFIX_URL = re_social_subdomain('substack.com')
@@ -258,7 +259,9 @@ PIXIV_USER_NICKNAME_URL = re_social_path_adv('pixiv.net', 'stacc', RE_ID)
PIXIV_SKETCH_USER_NICKNAME_URL = re_social_path_adv('sketch.pixiv.net', RE_ID) PIXIV_SKETCH_USER_NICKNAME_URL = re_social_path_adv('sketch.pixiv.net', RE_ID)
URL_PARSE_CARRD_PAGE = re_social_subdomain('carrd.co') URL_PARSE_CARRD_PAGE = re_social_subdomain('carrd.co')
URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1 = re_social_path_adv('youtube.com', RE_ID) URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1 = re_social_path_adv(
'youtube.com', RE_ID, RE_ANY_SUBPATH
)
URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2 = re_social_path_adv('youtube.com', 'c', RE_ID) URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2 = re_social_path_adv('youtube.com', 'c', RE_ID)
URL_PARSE_YOUTUBE_CHANNEL_ID = re_social_path_adv('youtube.com', 'channel', RE_ID) URL_PARSE_YOUTUBE_CHANNEL_ID = re_social_path_adv('youtube.com', 'channel', RE_ID)
URL_PARSE_VIMEO_CHANNEL = re_social_path_adv('vimeo.com', RE_ID) URL_PARSE_VIMEO_CHANNEL = re_social_path_adv('vimeo.com', RE_ID)
@@ -281,7 +284,7 @@ URL_PARSE_DANBOORU_ARTIST = re_social_path_adv('danbooru.donmai.us', 'artists',
URL_PARSE_BANDCAMP = re_social_subdomain('bandcamp.com') URL_PARSE_BANDCAMP = re_social_subdomain('bandcamp.com')
URL_PARSE_BLUESKY = re_social_path_adv('bsky.app', 'profile', RE_ID) URL_PARSE_BLUESKY = re_social_path_adv('bsky.app', 'profile', RE_ID)
REGEXES: list[tuple[str, SocialSiteId]] = [ REGEXES: list[tuple[str, object]] = [
# Reddit # Reddit
(REDDIT_SUBREDDIT_URL, SocialSiteId.REDDIT_SUBREDDIT), (REDDIT_SUBREDDIT_URL, SocialSiteId.REDDIT_SUBREDDIT),
(REDDIT_USER_URL, SocialSiteId.REDDIT_USER), (REDDIT_USER_URL, SocialSiteId.REDDIT_USER),
@@ -316,7 +319,8 @@ REGEXES: list[tuple[str, SocialSiteId]] = [
# Patreon # Patreon
(PATREON_URL, SocialSiteId.PATREON_PAGE), (PATREON_URL, SocialSiteId.PATREON_PAGE),
# Artstation # Artstation
(ARTSTATION_URL, SocialSiteId.ARTSTATION_PAGE), (ARTSTATION_URL_1, SocialSiteId.ARTSTATION_PAGE),
(ARTSTATION_URL_2, SocialSiteId.ARTSTATION_PAGE),
# Inprnt # Inprnt
(INPRNT_URL, SocialSiteId.INPRNT_PAGE), (INPRNT_URL, SocialSiteId.INPRNT_PAGE),
# Email # Email
@@ -371,7 +375,10 @@ REGEXES: list[tuple[str, SocialSiteId]] = [
SocialSiteId.STEAM_APPLICATION_ID, SocialSiteId.STEAM_APPLICATION_ID,
), ),
# Github # Github
(re_social_path_adv('github.com', RE_DUAL_ID), SocialSiteId.GITHUB_REPOSITORY), (
re_social_path_adv('github.com', RE_DUAL_ID, RE_ANY_SUBPATH),
SocialSiteId.GITHUB_REPOSITORY,
),
# Plurk # Plurk
(re_social_path_adv('plurk.com', RE_ID), SocialSiteId.PLURK), (re_social_path_adv('plurk.com', RE_ID), SocialSiteId.PLURK),
# Linked in # Linked in

View File

@@ -2,7 +2,7 @@ import pytest
from socials_util import * from socials_util import *
PARSABLE_SOCIAL_IDS_COMBINED = [ PARSABLE_SOCIAL_IDS_COMBINED: list[tuple[str, object, str]] = [
# Tumblr formats # Tumblr formats
('https://triviallytrue.tumblr.com/', SocialSiteId.TUMBLR, 'triviallytrue'), ('https://triviallytrue.tumblr.com/', SocialSiteId.TUMBLR, 'triviallytrue'),
('https://tumblr.com/triviallytrue', SocialSiteId.TUMBLR, 'triviallytrue'), ('https://tumblr.com/triviallytrue', SocialSiteId.TUMBLR, 'triviallytrue'),
@@ -19,7 +19,8 @@ PARSABLE_SOCIAL_IDS_COMBINED = [
SocialSiteId.TUMBLR, SocialSiteId.TUMBLR,
'triviallytrue', 'triviallytrue',
), ),
('http://worstdril.tumblr.com/', SocialSiteId.TUMBLR, 'worstdril'),
('https://deep-dark-fears.tumblr.com', SocialSiteId.TUMBLR, 'deep-dark-fears'),
# Cohost formats # Cohost formats
( (
'https://cohost.org/andrewelmore?page=0', 'https://cohost.org/andrewelmore?page=0',
@@ -31,58 +32,161 @@ PARSABLE_SOCIAL_IDS_COMBINED = [
SocialSiteId.COHOST_PROFILE, SocialSiteId.COHOST_PROFILE,
'andrewelmore', 'andrewelmore',
), ),
# Reddit formats # Reddit formats
('https://old.reddit.com/user/Harpsibored/submitted/', (
'https://old.reddit.com/user/Harpsibored/submitted/',
SocialSiteId.REDDIT_USER, SocialSiteId.REDDIT_USER,
'Harpsibored', 'Harpsibored',
), ),
('https://old.reddit.com/user/Harpsibored/submitted', (
'https://old.reddit.com/user/Harpsibored/submitted',
SocialSiteId.REDDIT_USER, SocialSiteId.REDDIT_USER,
'Harpsibored', 'Harpsibored',
), ),
('https://old.reddit.com/user/Harpsibored/', (
'https://old.reddit.com/user/Harpsibored/',
SocialSiteId.REDDIT_USER, SocialSiteId.REDDIT_USER,
'Harpsibored', 'Harpsibored',
), ),
('https://old.reddit.com/user/Harpsibored', (
'https://old.reddit.com/user/Harpsibored',
SocialSiteId.REDDIT_USER, SocialSiteId.REDDIT_USER,
'Harpsibored', 'Harpsibored',
), ),
# Ko-fi formats
('https://ko-fi.com/A627LI1/shop/', SocialSiteId.KO_FI, 'A627LI1'),
('https://ko-fi.com/A627LI1/shop', SocialSiteId.KO_FI, 'A627LI1'),
('https://ko-fi.com/A627LI1/', SocialSiteId.KO_FI, 'A627LI1'),
('https://ko-fi.com/A627LI1', SocialSiteId.KO_FI, 'A627LI1'),
# Twitter formats
('http://twitter.com/dril', SocialSiteId.TWITTER, 'dril'),
('http://www.twitter.com/dril', SocialSiteId.TWITTER, 'dril'),
('http://www.x.com/dril', SocialSiteId.TWITTER, 'dril'),
('http://x.com/dril', SocialSiteId.TWITTER, 'dril'),
('http://twitter.com/dril/media', SocialSiteId.TWITTER, 'dril'),
('http://www.twitter.com/dril/media', SocialSiteId.TWITTER, 'dril'),
('http://www.x.com/dril/media', SocialSiteId.TWITTER, 'dril'),
('http://x.com/dril/media', SocialSiteId.TWITTER, 'dril'),
# Wikidata formats
('https://wikidata.org/wiki/Q594400', SocialSiteId.WIKIDATA, 'Q594400'),
('https://m.wikidata.org/wiki/Q594400', SocialSiteId.WIKIDATA, 'Q594400'),
# YouTube formats
(
'https://youtube.com/@WheelieYellow',
SocialSiteId.YOUTUBE_CHANNEL_HANDLE,
'WheelieYellow',
),
(
'https://youtube.com/@WheelieYellow/',
SocialSiteId.YOUTUBE_CHANNEL_HANDLE,
'WheelieYellow',
),
(
'https://www.youtube.com/@WheelieYellow',
SocialSiteId.YOUTUBE_CHANNEL_HANDLE,
'WheelieYellow',
),
(
'https://www.youtube.com/@WheelieYellow/',
SocialSiteId.YOUTUBE_CHANNEL_HANDLE,
'WheelieYellow',
),
(
'https://www.youtube.com/@WheelieYellow/featured',
SocialSiteId.YOUTUBE_CHANNEL_HANDLE,
'WheelieYellow',
),
# GitHub
('https://github.com/love2d/love', SocialSiteId.GITHUB_REPOSITORY, 'love2d/love'),
('https://github.com/love2d/love/', SocialSiteId.GITHUB_REPOSITORY, 'love2d/love'),
(
'https://github.com/love2d/love/releases',
SocialSiteId.GITHUB_REPOSITORY,
'love2d/love',
),
# ArtStation
('https://toraji.artstation.com', SocialSiteId.ARTSTATION_PAGE, 'toraji'),
('https://www.artstation.com/toraji', SocialSiteId.ARTSTATION_PAGE, 'toraji'),
(
'https://www.artstation.com/toraji/profile',
SocialSiteId.ARTSTATION_PAGE,
'toraji',
),
# Tiktok
(
'https://tiktok.com/@depthsofwikipedia',
SocialSiteId.TIKTOK_USER,
'depthsofwikipedia',
),
(
'https://www.tiktok.com/@depthsofwikipedia',
SocialSiteId.TIKTOK_USER,
'depthsofwikipedia',
),
(
'https://www.tiktok.com/@depthsofwikipedia?lang=en',
SocialSiteId.TIKTOK_USER,
'depthsofwikipedia',
),
# Instagram
(
'https://instagram.com/_richardparry_',
SocialSiteId.INSTAGRAM_PAGE,
'_richardparry_',
),
('https://instagram.com/j_kmor/', SocialSiteId.INSTAGRAM_PAGE, 'j_kmor'),
(
'https://instagram.com/cullensartbox/',
SocialSiteId.INSTAGRAM_PAGE,
'cullensartbox',
),
(
'https://www.instagram.com/timkongart/',
SocialSiteId.INSTAGRAM_PAGE,
'timkongart',
),
('https://www.instagram.com/kcn.wu/', SocialSiteId.INSTAGRAM_PAGE, 'kcn.wu'),
(
'https://www.instagram.com/itsbettyjiang',
SocialSiteId.INSTAGRAM_PAGE,
'itsbettyjiang',
),
# Facebook
(
'https://www.facebook.com/fredagscafeen.dk/',
SocialSiteId.FACEBOOK_PAGE,
'fredagscafeen.dk',
),
# Pixiv
('https://www.pixiv.net/users/14866303', SocialSiteId.PIXIV_USER_ID, '14866303'),
(
'https://www.pixiv.net/member.php?id=109710',
SocialSiteId.PIXIV_USER_ID,
'109710',
),
# Etsy
(
'https://www.etsy.com/shop/aleksiremesart',
SocialSiteId.ETSY_SHOP,
'aleksiremesart',
),
# Deviantart
(
'https://www.deviantart.com/solquiet',
SocialSiteId.DEVIANT_ART_ACCOUNT,
'solquiet',
),
('https://solquiet.deviantart.com/', SocialSiteId.DEVIANT_ART_ACCOUNT, 'solquiet'),
] ]
PARSABLE_SOCIAL_IDS = [
('http://www.twitter.com/dril', 'dril'),
('http://worstdril.tumblr.com/', 'worstdril'),
('https://deep-dark-fears.tumblr.com', 'deep-dark-fears'),
('https://www.etsy.com/shop/aleksiremesart', 'aleksiremesart'),
('https://ko-fi.com/A627LI1/shop', 'A627LI1'),
('https://ko-fi.com/A627LI1/', 'A627LI1'),
('https://www.facebook.com/fredagscafeen.dk/', 'fredagscafeen.dk'),
('https://www.tiktok.com/@depthsofwikipedia?lang=en', 'depthsofwikipedia'),
('https://www.pixiv.net/users/14866303', '14866303'),
('https://www.pixiv.net/member.php?id=109710', '109710'),
] + [(a, c) for (a, b, c) in PARSABLE_SOCIAL_IDS_COMBINED]
PARSABLE_SOCIAL_SITE_IDS = [ @pytest.mark.parametrize(
('https://www.deviantart.com/solquiet', SocialSiteId.DEVIANT_ART_ACCOUNT), 'url,expected_social_site_id,expected_social_id', PARSABLE_SOCIAL_IDS_COMBINED
('https://solquiet.deviantart.com/', SocialSiteId.DEVIANT_ART_ACCOUNT), )
('https://instagram.com/_richardparry_', SocialSiteId.INSTAGRAM_PAGE), def test_parse_social_ids(url, expected_social_site_id, expected_social_id):
('https://instagram.com/j_kmor/', SocialSiteId.INSTAGRAM_PAGE), social_link: SocialLink | None = determine_social_from_url(url)
('https://instagram.com/cullensartbox/', SocialSiteId.INSTAGRAM_PAGE), assert social_link is not None, url
('https://www.instagram.com/timkongart/', SocialSiteId.INSTAGRAM_PAGE), assert (social_link.social_id, social_link.social_site_id) == (
('https://www.instagram.com/kcn.wu/', SocialSiteId.INSTAGRAM_PAGE), expected_social_id,
('https://www.instagram.com/itsbettyjiang', SocialSiteId.INSTAGRAM_PAGE), expected_social_site_id,
] + [(a, b) for (a, b, c) in PARSABLE_SOCIAL_IDS_COMBINED] ), url
@pytest.mark.parametrize('url,expected_social_id', PARSABLE_SOCIAL_IDS)
def test_parse_social_ids(url, expected_social_id):
social_link = determine_social_from_url(url)
assert social_link.social_id == expected_social_id, url
assert social_link.social_site_id is not None, url
@pytest.mark.parametrize('url,expected_social_site_id', PARSABLE_SOCIAL_SITE_IDS)
def test_parse_social_site_ids(url, expected_social_site_id):
assert determine_social_from_url(url).social_site_id == expected_social_site_id, url