@@ -1,22 +1,17 @@
-"""
-Small utility for detecting social websites.
-"""
+"""Small utility for detecting social websites."""
 
-from dataclasses import dataclass
-from enforce_typing import enforce_types
-from typing import List, Set, Optional, Union
-import aenum
-import datetime
 import re
 import urllib.parse
+from dataclasses import dataclass
 
-from socials_util._version import __version__
+import aenum
+from enforce_typing import enforce_types
+
+from socials_util._version import __version__  # noqa: F401
 
 
 class SocialSiteId(aenum.Enum):
-    """
-    The great social website enum.
-    """
+    """The great social website enum."""
 
     # Reddit-like
     REDDIT = 1  # Should have been named REDDIT_SUBREDDIT
@@ -103,22 +98,22 @@ AGGERAGOR_SOCIALS = {
 
 @enforce_types
 @dataclass(frozen=True)
-class SocialLink(object):
+class SocialLink:
     url: urllib.parse.ParseResult
     social_site_id: SocialSiteId
-    social_id: Optional[str]
+    social_id: str | None
 
 
 @enforce_types
 @dataclass(frozen=True)
-class WikidataInfo(object):
-    property_id: Optional[int]
-    issuer_id: Optional[int]
-    id_version_of: Optional[SocialSiteId] = None
-    nickname_version_of: Optional[SocialSiteId] = None
+class WikidataInfo:
+    property_id: int | None
+    issuer_id: int | None
+    id_version_of: SocialSiteId | None = None
+    nickname_version_of: SocialSiteId | None = None
 
 
-WIKIDATA_PROPERTIES = {
+WIKIDATA_PROPERTIES: dict[SocialSiteId, WikidataInfo] = {
     SocialSiteId.EMAIL: WikidataInfo(968, None),
     SocialSiteId.RSS_FEED: WikidataInfo(1079, None),
     SocialSiteId.FACEBOOK_PAGE: WikidataInfo(2013, None),
@@ -134,10 +129,14 @@ WIKIDATA_PROPERTIES = {
     SocialSiteId.TUMBLR: WikidataInfo(3943, None),
     SocialSiteId.TIKTOK_USER: WikidataInfo(7085, None),
     SocialSiteId.PIXIV_USER_ID: WikidataInfo(
-        5435, 306956, id_version_of=SocialSiteId.PIXIV_USER_NICKNAME
+        5435,
+        306956,
+        id_version_of=SocialSiteId.PIXIV_USER_NICKNAME,
     ),
     SocialSiteId.PIXIV_USER_NICKNAME: WikidataInfo(
-        None, 306956, nickname_version_of=SocialSiteId.PIXIV_USER_ID
+        None,
+        306956,
+        nickname_version_of=SocialSiteId.PIXIV_USER_ID,
     ),
     SocialSiteId.MASTODON_PAGE: WikidataInfo(4033, None),
     SocialSiteId.PATREON_PAGE: WikidataInfo(4175, 15861362),
@@ -145,10 +144,14 @@ WIKIDATA_PROPERTIES = {
     # SocialSiteId.INPRNT_PAGE: WikidataInfo(None, None),
     SocialSiteId.CARRD_PAGE: WikidataInfo(None, 106036503),
     SocialSiteId.YOUTUBE_CHANNEL_HANDLE: WikidataInfo(
-        11245, 866, nickname_version_of=SocialSiteId.YOUTUBE_CHANNEL_ID
+        11245,
+        866,
+        nickname_version_of=SocialSiteId.YOUTUBE_CHANNEL_ID,
     ),
     SocialSiteId.YOUTUBE_CHANNEL_ID: WikidataInfo(
-        2397, 866, id_version_of=SocialSiteId.YOUTUBE_CHANNEL_HANDLE
+        2397,
+        866,
+        id_version_of=SocialSiteId.YOUTUBE_CHANNEL_HANDLE,
     ),
     SocialSiteId.VIMEO_CHANNEL: WikidataInfo(4015, 156376),
     SocialSiteId.NEWGROUNDS_PAGE: WikidataInfo(None, 263655),
@@ -184,8 +187,7 @@ WIKIDATA_PROPERTIES = {
 }
 
 
-def re_social_subdomain(main_domain):
-    # return r'^(?:https?:\/\/)?([\w_-]+)\.'+re.escape(main_domain)+'\/?$'
+def re_social_subdomain(main_domain: str) -> str:
    return r'^(?:https?:\/\/)?([\w_-]+)\.' + re.escape(main_domain) + r'(\/.*)?$'
 
 
@@ -194,23 +196,30 @@ RE_DUAL_ID = r'@?([^/]+/[^/]+)'
 RE_ANY_SUBPATH = r'(|\/|\/.*)$'
 
 
-def re_social_path(main_domain):
-    # return r'^(?:https?:\/\/)?(?:www\.)?'+re.escape(main_domain)+'\/'+RE_ID+'\/?$'
+def re_social_path(main_domain: str) -> str:
     return re_social_path_adv(main_domain, RE_ID)
 
 
-def re_social_path_adv(main_domain, *path):
-    assert not main_domain.startswith('www.'), 'Redundant www.'
-    l = [r'^', r'(?:https?:\/\/)?', r'(?:www\.)?', re.escape(main_domain)]
+def re_social_path_adv(main_domain: str, *path: str) -> str:
+    if main_domain.startswith('www.'):
+        msg = f'Redundant www: {main_domain}'
+        raise ValueError(msg)
+    regex_builder: list[str] = [
+        r'^',
+        r'(?:https?:\/\/)?',
+        r'(?:www\.)?',
+        re.escape(main_domain),
+    ]
 
     for p in path:
         if p != RE_ANY_SUBPATH:
-            l.append(r'\/')
-        l.append(p if p in {RE_ID, RE_DUAL_ID, RE_ANY_SUBPATH} else re.escape(p))
+            regex_builder.append(r'\/')
+        regex_builder.append(
+            p if p in {RE_ID, RE_DUAL_ID, RE_ANY_SUBPATH} else re.escape(p),
+        )
     if path[-1] != RE_ANY_SUBPATH:
-        l.append(r'\/?$')
-    regex = ''.join(l)
-    return regex
+        regex_builder.append(r'\/?$')
+    return ''.join(regex_builder)
 
 
 MAILTO_URL = r'^mailto:(?:[\w._.]+@[\w._.]+)$'
@@ -253,10 +262,16 @@ URL_PARSE_YOUTUBE_CHANNEL_ID = re_social_path_adv('youtube.com', 'channel', RE_I
 URL_PARSE_VIMEO_CHANNEL = re_social_path_adv('vimeo.com', RE_ID)
 URL_PARSE_NEWGROUNDS_PAGE = re_social_subdomain('newgrounds.com')
 URL_PARSE_ARTSY_ARTIST = re_social_path_adv(
-    'artsy.net', 'artist', RE_ID, RE_ANY_SUBPATH
+    'artsy.net',
+    'artist',
+    RE_ID,
+    RE_ANY_SUBPATH,
 )
 URL_PARSE_ARTNET_ARTIST = re_social_path_adv(
-    'artnet.com', 'artists', RE_ID, RE_ANY_SUBPATH
+    'artnet.com',
+    'artists',
+    RE_ID,
+    RE_ANY_SUBPATH,
 )
 URL_PARSE_DEVIANT_ART_ACCOUNT = re_social_path_adv('deviantart.com', RE_ID)
 URL_PARSE_DEVIANT_ART_ACCOUNT_2 = re_social_subdomain('deviantart.com')
@@ -264,7 +279,7 @@ URL_PARSE_DANBOORU_ARTIST = re_social_path_adv('danbooru.donmai.us', 'artists',
 URL_PARSE_BANDCAMP = re_social_subdomain('bandcamp.com')
 URL_PARSE_BLUESKY = re_social_path_adv('bsky.app', 'profile', RE_ID)
 
-REGEXES = [
+REGEXES: list[tuple[str, SocialSiteId]] = [
     # Reddit
     (REDDIT_SUBREDDIT_URL, SocialSiteId.REDDIT_SUBREDDIT),
     (REDDIT_USER_URL, SocialSiteId.REDDIT_USER),
@@ -364,7 +379,7 @@ REGEXES = [
     (re_social_subdomain('blogspot.com'), SocialSiteId.GOOGLE_BLOGGER_PAGE),
 ]
 
-WELL_KNOWN_MASTODON_INSTANCES = frozenset(
+WELL_KNOWN_MASTODON_INSTANCES: frozenset[str] = frozenset(
     {
         # Includes all servers with 50 000+ users as of 6 july 2023.
         # based on https://mastodonservers.net/servers/top
@@ -384,12 +399,16 @@ WELL_KNOWN_MASTODON_INSTANCES = frozenset(
         'mastodonapp.uk',
         'fosstodon.org',
         'idlethumbs.social',
-    }
+    },
 )
 
 
-def determine_social_from_url_internally(url: str):
-    assert isinstance(url, str)
+def determine_social_from_url_internally(
+    url: str,
+) -> tuple[SocialSiteId | None, str | None]:
+    if not isinstance(url, str):
+        msg = f'Url must be {str}'
+        raise TypeError(msg)
 
     # Regexes
     for social_site_url_regex, social_site_id in REGEXES:
@@ -405,21 +424,33 @@ def determine_social_from_url_internally(url: str):
         return (SocialSiteId.MASTODON_PAGE, None)
 
     # Feed (?)
-    elif 'feed' in url or 'xml' in url or 'rss' in url or 'atom' in url:
+    if 'feed' in url or 'xml' in url or 'rss' in url or 'atom' in url:
         return (SocialSiteId.RSS_FEED, None)
 
     return (None, None)
 
 
-def determine_social_from_url(url):
+def to_parse_result(url: str | urllib.parse.ParseResult) -> urllib.parse.ParseResult:
     if isinstance(url, str):
-        url = urllib.parse.urlparse(url)
+        return urllib.parse.urlparse(url)
+    if isinstance(url, urllib.parse.ParseResult):
+        return url
+
+    # Throw error
+    msg = f'Expected {urllib.parse.ParseResult} or {str}'
+    raise TypeError(msg)
+
+
+def determine_social_from_url(
+    url_not_normalized: str | urllib.parse.ParseResult,
+) -> SocialLink | None:
+    url = to_parse_result(url_not_normalized)
     (social_site_id, social_id) = determine_social_from_url_internally(
-        url._replace(query='', fragment='').geturl()
+        url._replace(query='', fragment='').geturl(),
     )
     if not social_site_id:
         (social_site_id, social_id) = determine_social_from_url_internally(
-            url._replace(fragment='').geturl()
+            url._replace(fragment='').geturl(),
         )
 
     if not social_site_id:
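
A minimal usage sketch of the refactored entry point (an editor's illustration, not part of the patch; it assumes the module above is importable as socials_util, and the URLs and printed values are hypothetical examples):

    import urllib.parse

    from socials_util import determine_social_from_url

    # A plain string is parsed internally via the new to_parse_result() helper.
    link = determine_social_from_url('https://www.reddit.com/r/programming')
    if link is not None:
        print(link.social_site_id)  # e.g. SocialSiteId.REDDIT_SUBREDDIT
        print(link.social_id)       # e.g. 'programming'

    # An already-parsed ParseResult is accepted as-is; the query and fragment
    # are stripped before matching, as in determine_social_from_url above.
    parsed = urllib.parse.urlparse('https://vimeo.com/somechannel?autoplay=1')
    print(determine_social_from_url(parsed))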