1
0

Pixiv has too many link formats

This commit is contained in:
Jon Michael Aanes 2023-09-02 18:29:56 +02:00
parent bdf60219fd
commit 2bdd5b43fa

View File

@ -32,7 +32,8 @@ class SocialSiteId(enum.Enum):
KO_FI = 20 KO_FI = 20
BEHANCE_PAGE = 21 BEHANCE_PAGE = 21
TIKTOK_USER = 7085 TIKTOK_USER = 7085
PIXIV_USER = 23 PIXIV_USER_ID = 5435
PIXIV_USER_NICKNAME = 31
CARRD_PAGE = 24 CARRD_PAGE = 24
HENTAI_FOUNDRY = 25 HENTAI_FOUNDRY = 25
YOUTUBE_CHANNEL_HANDLE = 26 YOUTUBE_CHANNEL_HANDLE = 26
@ -42,6 +43,8 @@ class SocialSiteId(enum.Enum):
ARTSY_ARTIST = 2042 ARTSY_ARTIST = 2042
LINK_COLLECTION_PAGE = 29 LINK_COLLECTION_PAGE = 29
DEVIANT_ART_ACCOUNT = 7737 DEVIANT_ART_ACCOUNT = 7737
DANBOORU_ARTIST = 30
BANDCAMP_PROFILE = 3283
def wikidata_property(self, client): def wikidata_property(self, client):
return client.get(WIKIDATA_PROPERTIES[self]) return client.get(WIKIDATA_PROPERTIES[self])
@ -49,7 +52,13 @@ class SocialSiteId(enum.Enum):
def is_aggregator(self): def is_aggregator(self):
return self in AGGERAGOR_SOCIALS return self in AGGERAGOR_SOCIALS
AGGERAGOR_SOCIALS = {SocialSiteId.LINKTREE_PAGE, SocialSiteId.WIKIDATA} AGGERAGOR_SOCIALS = {
SocialSiteId.LINKTREE_PAGE,
SocialSiteId.WIKIDATA,
SocialSiteId.CARRD_PAGE,
SocialSiteId.LINK_COLLECTION_PAGE,
SocialSiteId.DANBOORU_ARTIST,
}
@enforce_types @enforce_types
@dataclass(frozen = True) @dataclass(frozen = True)
@ -63,6 +72,8 @@ class SocialLink(object):
class WikidataInfo(object): class WikidataInfo(object):
property_id: Optional[int] property_id: Optional[int]
issuer_id: Optional[int] issuer_id: Optional[int]
id_version_of: Optional[SocialSiteId] = None
nickname_version_of: Optional[SocialSiteId] = None
WIKIDATA_PROPERTIES = { WIKIDATA_PROPERTIES = {
SocialSiteId.EMAIL: WikidataInfo(968, None), SocialSiteId.EMAIL: WikidataInfo(968, None),
@ -82,7 +93,9 @@ WIKIDATA_PROPERTIES = {
SocialSiteId.TUMBLR: WikidataInfo(3943, None), SocialSiteId.TUMBLR: WikidataInfo(3943, None),
SocialSiteId.TIKTOK_USER: WikidataInfo(7085, None), SocialSiteId.TIKTOK_USER: WikidataInfo(7085, None),
SocialSiteId.PIXIV_USER: WikidataInfo(None, 306956), SocialSiteId.PIXIV_USER_ID: WikidataInfo(5435, 306956, id_version_of = SocialSiteId.PIXIV_USER_NICKNAME),
SocialSiteId.PIXIV_USER_NICKNAME: WikidataInfo(None, 306956, nickname_version_of = SocialSiteId.PIXIV_USER_ID),
SocialSiteId.MASTODON_PAGE: WikidataInfo(4033, None), SocialSiteId.MASTODON_PAGE: WikidataInfo(4033, None),
SocialSiteId.PATREON_PAGE: WikidataInfo(4175, 15861362), SocialSiteId.PATREON_PAGE: WikidataInfo(4175, 15861362),
SocialSiteId.ARTSTATION_PAGE: WikidataInfo(None, 65551500), SocialSiteId.ARTSTATION_PAGE: WikidataInfo(None, 65551500),
@ -90,12 +103,14 @@ WIKIDATA_PROPERTIES = {
SocialSiteId.CARRD_PAGE: WikidataInfo(None, 106036503), SocialSiteId.CARRD_PAGE: WikidataInfo(None, 106036503),
SocialSiteId.HENTAI_FOUNDRY: WikidataInfo(None, 115903301), SocialSiteId.HENTAI_FOUNDRY: WikidataInfo(None, 115903301),
SocialSiteId.YOUTUBE_CHANNEL_HANDLE: WikidataInfo(11245, 866), SocialSiteId.YOUTUBE_CHANNEL_HANDLE: WikidataInfo(11245, 866, nickname_version_of = SocialSiteId.YOUTUBE_CHANNEL_ID),
SocialSiteId.YOUTUBE_CHANNEL_ID: WikidataInfo(2397, None), SocialSiteId.YOUTUBE_CHANNEL_ID: WikidataInfo(2397, 866, id_version_of = SocialSiteId.YOUTUBE_CHANNEL_HANDLE),
SocialSiteId.VIMEO_CHANNEL: WikidataInfo(4015, 156376), SocialSiteId.VIMEO_CHANNEL: WikidataInfo(4015, 156376),
SocialSiteId.NEWGROUNDS_PAGE: WikidataInfo(None, 263655), SocialSiteId.NEWGROUNDS_PAGE: WikidataInfo(None, 263655),
SocialSiteId.ARTSY_ARTIST: WikidataInfo(2042, 4796642), SocialSiteId.ARTSY_ARTIST: WikidataInfo(2042, 4796642),
SocialSiteId.DEVIANT_ART_ACCOUNT: WikidataInfo(7737, None), SocialSiteId.DEVIANT_ART_ACCOUNT: WikidataInfo(7737, None),
SocialSiteId.DANBOORU_ARTIST: WikidataInfo(None, 64514853),
SocialSiteId.BANDCAMP_PROFILE: WikidataInfo(3283, 545966),
} }
def re_social_subdomain(main_domain): def re_social_subdomain(main_domain):
@ -103,6 +118,7 @@ def re_social_subdomain(main_domain):
return r'^(?:https?:\/\/)?([\w_-]+)\.'+re.escape(main_domain)+'(\/.*)?$' return r'^(?:https?:\/\/)?([\w_-]+)\.'+re.escape(main_domain)+'(\/.*)?$'
RE_ID = r'@?([^/]+)' RE_ID = r'@?([^/]+)'
RE_ANY_SUBPATH = r'(|\/|\/.*)$'
def re_social_path(main_domain): def re_social_path(main_domain):
#return r'^(?:https?:\/\/)?(?:www\.)?'+re.escape(main_domain)+'\/'+RE_ID+'\/?$' #return r'^(?:https?:\/\/)?(?:www\.)?'+re.escape(main_domain)+'\/'+RE_ID+'\/?$'
@ -113,10 +129,13 @@ def re_social_path_adv(main_domain, *path):
l = [r'^', '(?:https?:\/\/)?', r'(?:www\.)?', re.escape(main_domain)] l = [r'^', '(?:https?:\/\/)?', r'(?:www\.)?', re.escape(main_domain)]
for p in path: for p in path:
l.append(r'\/') if p != RE_ANY_SUBPATH:
l.append(RE_ID if p == RE_ID else re.escape(p)) l.append(r'\/')
l.append(r'\/?$') l.append(p if p in {RE_ID, RE_ANY_SUBPATH} else re.escape(p))
return ''.join(l) if path[-1] != RE_ANY_SUBPATH:
l.append(r'\/?$')
regex = ''.join(l)
return regex
MAILTO_URL = r'^mailto:(?:[\w._.]+@[\w._.]+)$' MAILTO_URL = r'^mailto:(?:[\w._.]+@[\w._.]+)$'
@ -130,7 +149,7 @@ SONGKICK_ARTIST_URL = r'^(?:https?:\/\/)?(?:www\.)?songkick\.com\/artists\/(\d+)
TUMBLR_PAGE_URL = re_social_path('tumblr.com') TUMBLR_PAGE_URL = re_social_path('tumblr.com')
TUMBLR_PAGE_URL_2 = re_social_subdomain('tumblr.com') TUMBLR_PAGE_URL_2 = re_social_subdomain('tumblr.com')
INSTAGRAM_URL = re_social_path('instagram.com') INSTAGRAM_URL = re_social_path('instagram.com')
PATREON_URL = re_social_path('patreon.com') PATREON_URL = re_social_path_adv('patreon.com', RE_ID, RE_ANY_SUBPATH)
ARTSTATION_URL = re_social_path('artstation.com') ARTSTATION_URL = re_social_path('artstation.com')
INPRNT_URL = re_social_path_adv('inprnt.com', 'gallery', RE_ID) INPRNT_URL = re_social_path_adv('inprnt.com', 'gallery', RE_ID)
FACEBOOK_PAGE_URL = re_social_path('facebook.com') FACEBOOK_PAGE_URL = re_social_path('facebook.com')
@ -138,11 +157,14 @@ SUBSTACK_PREFIX_URL = re_social_subdomain('substack.com')
ETSY_SHOP_URL = re_social_path_adv('etsy.com', 'shop', RE_ID) ETSY_SHOP_URL = re_social_path_adv('etsy.com', 'shop', RE_ID)
BEHANCE_PAGE_URL = re_social_path('behance.net') BEHANCE_PAGE_URL = re_social_path('behance.net')
TIKTOK_USER_URL = re_social_path('tiktok.com') TIKTOK_USER_URL = re_social_path('tiktok.com')
PIXIV_USER_URL = r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/users/(\d+)\/?$' PIXIV_USER_ID_URL = r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/users/(\d+)\/?$'
PIXIV_USER_URL_2 = r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/member\.php\/?[?]id=(\d+)$' PIXIV_USER_ID_URL_2 = r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/member\.php\/?[?]id=(\d+)$'
PIXIV_FANBOX_USER_NICKNAME_URL = re_social_subdomain('fanbox.cc')
PIXIV_USER_NICKNAME_URL = re_social_path_adv('pixiv.net', 'stacc', RE_ID)
PIXIV_SKETCH_USER_NICKNAME_URL = re_social_path_adv('sketch.pixiv.net', RE_ID)
URL_PARSE_CARRD_PAGE = re_social_subdomain('carrd.co') URL_PARSE_CARRD_PAGE = re_social_subdomain('carrd.co')
URL_PARSE_HENTAI_FOUNDRY= re_social_path_adv('hentai-foundry.com', 'user', RE_ID, 'profile') URL_PARSE_HENTAI_FOUNDRY = re_social_path_adv('hentai-foundry.com', 'user', RE_ID, RE_ANY_SUBPATH)
URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1= re_social_path_adv('youtube.com', RE_ID) URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1= re_social_path_adv('youtube.com', RE_ID)
URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2= re_social_path_adv('youtube.com', 'c', RE_ID) URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2= re_social_path_adv('youtube.com', 'c', RE_ID)
URL_PARSE_YOUTUBE_CHANNEL_ID= re_social_path_adv('youtube.com', 'channel', RE_ID) URL_PARSE_YOUTUBE_CHANNEL_ID= re_social_path_adv('youtube.com', 'channel', RE_ID)
@ -151,6 +173,9 @@ URL_PARSE_NEWGROUNDS_PAGE = re_social_subdomain('newgrounds.com')
URL_PARSE_ARTSY_ARTIST = re_social_path_adv('artsy.net', 'artist', RE_ID) URL_PARSE_ARTSY_ARTIST = re_social_path_adv('artsy.net', 'artist', RE_ID)
URL_PARSE_DEVIANT_ART_ACCOUNT = re_social_path_adv('deviantart.com', RE_ID) URL_PARSE_DEVIANT_ART_ACCOUNT = re_social_path_adv('deviantart.com', RE_ID)
URL_PARSE_DEVIANT_ART_ACCOUNT_2 = re_social_subdomain('deviantart.com') URL_PARSE_DEVIANT_ART_ACCOUNT_2 = re_social_subdomain('deviantart.com')
URL_PARSE_DANBOORU_ARTIST = re_social_path_adv('danbooru.donmai.us', 'artists', RE_ID)
URL_PARSE_BANDCAMP = re_social_subdomain('bandcamp.com')
# TODO: https://<ID>.deviantart.com # TODO: https://<ID>.deviantart.com
REGEXES = [ REGEXES = [
@ -187,8 +212,11 @@ REGEXES = [
(TIKTOK_USER_URL, SocialSiteId.TIKTOK_USER), (TIKTOK_USER_URL, SocialSiteId.TIKTOK_USER),
# Pixiv # Pixiv
(PIXIV_USER_URL, SocialSiteId.PIXIV_USER), (PIXIV_USER_ID_URL, SocialSiteId.PIXIV_USER_ID),
(PIXIV_USER_URL_2, SocialSiteId.PIXIV_USER), (PIXIV_USER_ID_URL_2, SocialSiteId.PIXIV_USER_ID),
(PIXIV_FANBOX_USER_NICKNAME_URL , SocialSiteId.PIXIV_USER_NICKNAME),
(PIXIV_USER_NICKNAME_URL , SocialSiteId.PIXIV_USER_NICKNAME),
(PIXIV_SKETCH_USER_NICKNAME_URL, SocialSiteId.PIXIV_USER_NICKNAME),
# Patreon # Patreon
(PATREON_URL, SocialSiteId.PATREON_PAGE), (PATREON_URL, SocialSiteId.PATREON_PAGE),
@ -238,6 +266,12 @@ REGEXES = [
# Deviant art # Deviant art
(URL_PARSE_DEVIANT_ART_ACCOUNT, SocialSiteId.DEVIANT_ART_ACCOUNT), (URL_PARSE_DEVIANT_ART_ACCOUNT, SocialSiteId.DEVIANT_ART_ACCOUNT),
(URL_PARSE_DEVIANT_ART_ACCOUNT_2, SocialSiteId.DEVIANT_ART_ACCOUNT), (URL_PARSE_DEVIANT_ART_ACCOUNT_2, SocialSiteId.DEVIANT_ART_ACCOUNT),
# Danbooru
(URL_PARSE_DANBOORU_ARTIST, SocialSiteId.DANBOORU_ARTIST),
# Bandcamp
(URL_PARSE_BANDCAMP, SocialSiteId.BANDCAMP_PROFILE),
] ]
WELL_KNOWN_MASTODON_INSTANCES = frozenset({ WELL_KNOWN_MASTODON_INSTANCES = frozenset({
@ -260,12 +294,12 @@ WELL_KNOWN_MASTODON_INSTANCES = frozenset({
'fosstodon.org', 'fosstodon.org',
}) })
def determine_social_from_url_internally(url): def determine_social_from_url_internally(url: str):
assert isinstance(url, str) assert isinstance(url, str)
# Regexes # Regexes
for (social_site_url_regex, social_site_id) in REGEXES: for (social_site_url_regex, social_site_id) in REGEXES:
if m := re.match(social_site_url_regex, url, re.I): if m := re.fullmatch(social_site_url_regex, url, re.I):
groups = m.groups() groups = m.groups()
return (social_site_id, groups[0] if len(groups) > 0 else None) return (social_site_id, groups[0] if len(groups) > 0 else None)