Pixiv has too many link formats
This commit is contained in:
parent
bdf60219fd
commit
2bdd5b43fa
68
__init__.py
68
__init__.py
|
@ -32,7 +32,8 @@ class SocialSiteId(enum.Enum):
|
|||
KO_FI = 20
|
||||
BEHANCE_PAGE = 21
|
||||
TIKTOK_USER = 7085
|
||||
PIXIV_USER = 23
|
||||
PIXIV_USER_ID = 5435
|
||||
PIXIV_USER_NICKNAME = 31
|
||||
CARRD_PAGE = 24
|
||||
HENTAI_FOUNDRY = 25
|
||||
YOUTUBE_CHANNEL_HANDLE = 26
|
||||
|
@ -42,6 +43,8 @@ class SocialSiteId(enum.Enum):
|
|||
ARTSY_ARTIST = 2042
|
||||
LINK_COLLECTION_PAGE = 29
|
||||
DEVIANT_ART_ACCOUNT = 7737
|
||||
DANBOORU_ARTIST = 30
|
||||
BANDCAMP_PROFILE = 3283
|
||||
|
||||
def wikidata_property(self, client):
    """Fetch this site's Wikidata entry through ``client``.

    Looks up the ``WIKIDATA_PROPERTIES`` value registered for this enum
    member and returns whatever ``client.get`` returns for it.

    Raises ``KeyError`` when this member has no ``WIKIDATA_PROPERTIES`` entry.

    NOTE(review): the whole ``WIKIDATA_PROPERTIES`` value (not just a
    property id) is handed to ``client.get`` -- confirm that is what the
    client expects.
    """
    wikidata_info = WIKIDATA_PROPERTIES[self]
    return client.get(wikidata_info)
|
||||
|
@ -49,7 +52,13 @@ class SocialSiteId(enum.Enum):
|
|||
def is_aggregator(self):
    """Return True when this site is a member of ``AGGERAGOR_SOCIALS``."""
    listed_as_aggregator = self in AGGERAGOR_SOCIALS
    return listed_as_aggregator
|
||||
|
||||
AGGERAGOR_SOCIALS = {SocialSiteId.LINKTREE_PAGE, SocialSiteId.WIKIDATA}
|
||||
AGGERAGOR_SOCIALS = {
|
||||
SocialSiteId.LINKTREE_PAGE,
|
||||
SocialSiteId.WIKIDATA,
|
||||
SocialSiteId.CARRD_PAGE,
|
||||
SocialSiteId.LINK_COLLECTION_PAGE,
|
||||
SocialSiteId.DANBOORU_ARTIST,
|
||||
}
|
||||
|
||||
@enforce_types
|
||||
@dataclass(frozen = True)
|
||||
|
@ -63,6 +72,8 @@ class SocialLink(object):
|
|||
class WikidataInfo:
    """Per-site Wikidata metadata; the value type of ``WIKIDATA_PROPERTIES``.

    Entries are written positionally as ``WikidataInfo(property_id, issuer_id)``;
    the two ``*_version_of`` links are optional and passed by keyword.
    """

    # NOTE(review): presumably the numeric part of a Wikidata property id
    # (P<n>) for this kind of link; None when there is none -- confirm.
    property_id: Optional[int]

    # NOTE(review): presumably the numeric part of the Wikidata item id
    # (Q<n>) of the site itself; None when there is none -- confirm.
    issuer_id: Optional[int]

    # Set on an ID-based entry: the SocialSiteId that addresses the same
    # service by nickname/handle (e.g. PIXIV_USER_ID -> PIXIV_USER_NICKNAME).
    id_version_of: Optional[SocialSiteId] = None

    # Set on a nickname/handle-based entry: the SocialSiteId that addresses
    # the same service by ID (e.g. PIXIV_USER_NICKNAME -> PIXIV_USER_ID).
    nickname_version_of: Optional[SocialSiteId] = None
|
||||
|
||||
WIKIDATA_PROPERTIES = {
|
||||
SocialSiteId.EMAIL: WikidataInfo(968, None),
|
||||
|
@ -82,7 +93,9 @@ WIKIDATA_PROPERTIES = {
|
|||
SocialSiteId.TUMBLR: WikidataInfo(3943, None),
|
||||
SocialSiteId.TIKTOK_USER: WikidataInfo(7085, None),
|
||||
|
||||
SocialSiteId.PIXIV_USER: WikidataInfo(None, 306956),
|
||||
SocialSiteId.PIXIV_USER_ID: WikidataInfo(5435, 306956, id_version_of = SocialSiteId.PIXIV_USER_NICKNAME),
|
||||
SocialSiteId.PIXIV_USER_NICKNAME: WikidataInfo(None, 306956, nickname_version_of = SocialSiteId.PIXIV_USER_ID),
|
||||
|
||||
SocialSiteId.MASTODON_PAGE: WikidataInfo(4033, None),
|
||||
SocialSiteId.PATREON_PAGE: WikidataInfo(4175, 15861362),
|
||||
SocialSiteId.ARTSTATION_PAGE: WikidataInfo(None, 65551500),
|
||||
|
@ -90,12 +103,14 @@ WIKIDATA_PROPERTIES = {
|
|||
|
||||
SocialSiteId.CARRD_PAGE: WikidataInfo(None, 106036503),
|
||||
SocialSiteId.HENTAI_FOUNDRY: WikidataInfo(None, 115903301),
|
||||
SocialSiteId.YOUTUBE_CHANNEL_HANDLE: WikidataInfo(11245, 866),
|
||||
SocialSiteId.YOUTUBE_CHANNEL_ID: WikidataInfo(2397, None),
|
||||
SocialSiteId.YOUTUBE_CHANNEL_HANDLE: WikidataInfo(11245, 866, nickname_version_of = SocialSiteId.YOUTUBE_CHANNEL_ID),
|
||||
SocialSiteId.YOUTUBE_CHANNEL_ID: WikidataInfo(2397, 866, id_version_of = SocialSiteId.YOUTUBE_CHANNEL_HANDLE),
|
||||
SocialSiteId.VIMEO_CHANNEL: WikidataInfo(4015, 156376),
|
||||
SocialSiteId.NEWGROUNDS_PAGE: WikidataInfo(None, 263655),
|
||||
SocialSiteId.ARTSY_ARTIST: WikidataInfo(2042, 4796642),
|
||||
SocialSiteId.DEVIANT_ART_ACCOUNT: WikidataInfo(7737, None),
|
||||
SocialSiteId.DANBOORU_ARTIST: WikidataInfo(None, 64514853),
|
||||
SocialSiteId.BANDCAMP_PROFILE: WikidataInfo(3283, 545966),
|
||||
}
|
||||
|
||||
def re_social_subdomain(main_domain):
|
||||
|
@ -103,6 +118,7 @@ def re_social_subdomain(main_domain):
|
|||
return r'^(?:https?:\/\/)?([\w_-]+)\.'+re.escape(main_domain)+'(\/.*)?$'
|
||||
|
||||
# Regex fragment for one identifier path segment: an optional leading '@'
# (not captured) followed by a capture group of one or more non-'/' characters.
# re_social_path_adv compares path items against this constant to decide
# whether to insert them verbatim instead of escaping them.
RE_ID = r'@?([^/]+)'
|
||||
# Regex fragment for "anything after this point": a capture group matching
# nothing, a lone '/', or '/' followed by any characters, anchored at the end.
# It adds its own capture group after any earlier one in the pattern.
RE_ANY_SUBPATH = r'(|\/|\/.*)$'
|
||||
|
||||
def re_social_path(main_domain):
|
||||
#return r'^(?:https?:\/\/)?(?:www\.)?'+re.escape(main_domain)+'\/'+RE_ID+'\/?$'
|
||||
|
@ -113,10 +129,13 @@ def re_social_path_adv(main_domain, *path):
|
|||
l = [r'^', '(?:https?:\/\/)?', r'(?:www\.)?', re.escape(main_domain)]
|
||||
|
||||
for p in path:
|
||||
l.append(r'\/')
|
||||
l.append(RE_ID if p == RE_ID else re.escape(p))
|
||||
l.append(r'\/?$')
|
||||
return ''.join(l)
|
||||
if p != RE_ANY_SUBPATH:
|
||||
l.append(r'\/')
|
||||
l.append(p if p in {RE_ID, RE_ANY_SUBPATH} else re.escape(p))
|
||||
if path[-1] != RE_ANY_SUBPATH:
|
||||
l.append(r'\/?$')
|
||||
regex = ''.join(l)
|
||||
return regex
|
||||
|
||||
# Matches a bare ``mailto:`` address.  The local part accepts word characters,
# '.', '+' and '-'; the domain accepts word characters, '.' and '-'.  The
# previous character class ([\w._.]) rejected hyphenated names/domains and
# plus-tags.  There is deliberately still no capture group, so a match yields
# no extracted identifier.
MAILTO_URL = r'^mailto:(?:[\w.+-]+@[\w.-]+)$'
|
||||
|
||||
|
@ -130,7 +149,7 @@ SONGKICK_ARTIST_URL = r'^(?:https?:\/\/)?(?:www\.)?songkick\.com\/artists\/(\d+)
|
|||
TUMBLR_PAGE_URL = re_social_path('tumblr.com')
|
||||
TUMBLR_PAGE_URL_2 = re_social_subdomain('tumblr.com')
|
||||
INSTAGRAM_URL = re_social_path('instagram.com')
|
||||
PATREON_URL = re_social_path('patreon.com')
|
||||
PATREON_URL = re_social_path_adv('patreon.com', RE_ID, RE_ANY_SUBPATH)
|
||||
ARTSTATION_URL = re_social_path('artstation.com')
|
||||
INPRNT_URL = re_social_path_adv('inprnt.com', 'gallery', RE_ID)
|
||||
FACEBOOK_PAGE_URL = re_social_path('facebook.com')
|
||||
|
@ -138,11 +157,14 @@ SUBSTACK_PREFIX_URL = re_social_subdomain('substack.com')
|
|||
ETSY_SHOP_URL = re_social_path_adv('etsy.com', 'shop', RE_ID)
|
||||
BEHANCE_PAGE_URL = re_social_path('behance.net')
|
||||
TIKTOK_USER_URL = re_social_path('tiktok.com')
|
||||
PIXIV_USER_URL = r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/users/(\d+)\/?$'
|
||||
PIXIV_USER_URL_2 = r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/member\.php\/?[?]id=(\d+)$'
|
||||
PIXIV_USER_ID_URL = r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/users/(\d+)\/?$'
|
||||
PIXIV_USER_ID_URL_2 = r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/member\.php\/?[?]id=(\d+)$'
|
||||
PIXIV_FANBOX_USER_NICKNAME_URL = re_social_subdomain('fanbox.cc')
|
||||
PIXIV_USER_NICKNAME_URL = re_social_path_adv('pixiv.net', 'stacc', RE_ID)
|
||||
PIXIV_SKETCH_USER_NICKNAME_URL = re_social_path_adv('sketch.pixiv.net', RE_ID)
|
||||
|
||||
URL_PARSE_CARRD_PAGE = re_social_subdomain('carrd.co')
|
||||
URL_PARSE_HENTAI_FOUNDRY= re_social_path_adv('hentai-foundry.com', 'user', RE_ID, 'profile')
|
||||
URL_PARSE_HENTAI_FOUNDRY = re_social_path_adv('hentai-foundry.com', 'user', RE_ID, RE_ANY_SUBPATH)
|
||||
URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1= re_social_path_adv('youtube.com', RE_ID)
|
||||
URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2= re_social_path_adv('youtube.com', 'c', RE_ID)
|
||||
URL_PARSE_YOUTUBE_CHANNEL_ID= re_social_path_adv('youtube.com', 'channel', RE_ID)
|
||||
|
@ -151,6 +173,9 @@ URL_PARSE_NEWGROUNDS_PAGE = re_social_subdomain('newgrounds.com')
|
|||
URL_PARSE_ARTSY_ARTIST = re_social_path_adv('artsy.net', 'artist', RE_ID)
|
||||
URL_PARSE_DEVIANT_ART_ACCOUNT = re_social_path_adv('deviantart.com', RE_ID)
|
||||
URL_PARSE_DEVIANT_ART_ACCOUNT_2 = re_social_subdomain('deviantart.com')
|
||||
URL_PARSE_DANBOORU_ARTIST = re_social_path_adv('danbooru.donmai.us', 'artists', RE_ID)
|
||||
URL_PARSE_BANDCAMP = re_social_subdomain('bandcamp.com')
|
||||
|
||||
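# TODO: https://<ID>.deviantart.com -- this subdomain form appears to be covered already by URL_PARSE_DEVIANT_ART_ACCOUNT_2 (re_social_subdomain('deviantart.com')); confirm and drop this note.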
|
||||
|
||||
REGEXES = [
|
||||
|
@ -187,8 +212,11 @@ REGEXES = [
|
|||
(TIKTOK_USER_URL, SocialSiteId.TIKTOK_USER),
|
||||
|
||||
# Pixiv
|
||||
(PIXIV_USER_URL, SocialSiteId.PIXIV_USER),
|
||||
(PIXIV_USER_URL_2, SocialSiteId.PIXIV_USER),
|
||||
(PIXIV_USER_ID_URL, SocialSiteId.PIXIV_USER_ID),
|
||||
(PIXIV_USER_ID_URL_2, SocialSiteId.PIXIV_USER_ID),
|
||||
(PIXIV_FANBOX_USER_NICKNAME_URL , SocialSiteId.PIXIV_USER_NICKNAME),
|
||||
(PIXIV_USER_NICKNAME_URL , SocialSiteId.PIXIV_USER_NICKNAME),
|
||||
(PIXIV_SKETCH_USER_NICKNAME_URL, SocialSiteId.PIXIV_USER_NICKNAME),
|
||||
|
||||
# Patreon
|
||||
(PATREON_URL, SocialSiteId.PATREON_PAGE),
|
||||
|
@ -238,6 +266,12 @@ REGEXES = [
|
|||
# Deviant art
|
||||
(URL_PARSE_DEVIANT_ART_ACCOUNT, SocialSiteId.DEVIANT_ART_ACCOUNT),
|
||||
(URL_PARSE_DEVIANT_ART_ACCOUNT_2, SocialSiteId.DEVIANT_ART_ACCOUNT),
|
||||
|
||||
# Danbooru
|
||||
(URL_PARSE_DANBOORU_ARTIST, SocialSiteId.DANBOORU_ARTIST),
|
||||
|
||||
# Bandcamp
|
||||
(URL_PARSE_BANDCAMP, SocialSiteId.BANDCAMP_PROFILE),
|
||||
]
|
||||
|
||||
WELL_KNOWN_MASTODON_INSTANCES = frozenset({
|
||||
|
@ -260,12 +294,12 @@ WELL_KNOWN_MASTODON_INSTANCES = frozenset({
|
|||
'fosstodon.org',
|
||||
})
|
||||
|
||||
def determine_social_from_url_internally(url):
|
||||
def determine_social_from_url_internally(url: str):
|
||||
assert isinstance(url, str)
|
||||
|
||||
# Regexes
|
||||
for (social_site_url_regex, social_site_id) in REGEXES:
|
||||
if m := re.match(social_site_url_regex, url, re.I):
|
||||
if m := re.fullmatch(social_site_url_regex, url, re.I):
|
||||
groups = m.groups()
|
||||
return (social_site_id, groups[0] if len(groups) > 0 else None)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user