1
0
socials-util/socials_util/__init__.py
2024-05-12 16:35:29 +02:00

459 lines
16 KiB
Python

"""Small utility for detecting social websites."""
import datetime
import re
import urllib.parse
from dataclasses import dataclass
from typing import List, Optional, Set, Union
import aenum
from enforce_typing import enforce_types
from socials_util._version import __version__
class SocialSiteId(aenum.Enum):
    """Identifier for every kind of social site or link source known here.

    The integer values are arbitrary but stable identifiers. Several of the
    larger ones coincide with the numeric ids used for the same site in
    ``WIKIDATA_PROPERTIES`` (presumably chosen on purpose; not enforced).
    """

    # --- Reddit-like ---------------------------------------------------
    REDDIT = 1  # Historical name; REDDIT_SUBREDDIT is the preferred spelling.
    REDDIT_USER = 22
    REDDIT_SUBREDDIT = REDDIT  # Alias of REDDIT (same value).

    # --- Microblogging -------------------------------------------------
    TWITTER = 2
    MASTODON_PAGE = 10
    INSTAGRAM_PAGE = 11
    BLUESKY_PROFILE = 12361
    BLUESKY_DID = 12409
    THREADS_USERNAME = 11892
    COHOST_PROFILE = 117203288

    # --- Blogs and feeds -----------------------------------------------
    RSS_FEED = 3
    PATREON_PAGE = 12
    TUMBLR = 9
    SUBSTACK = 18
    MEDIUM_BLOG = 3899
    GOOGLE_BLOGGER_PAGE = 171186

    # --- Video -----------------------------------------------------------
    TWITCH = 6

    # --- Miscellaneous ---------------------------------------------------
    PAGE_WATCH = 4
    LINKTREE_PAGE = 5
    WIKIDATA = 7
    SONGKICK_ARTIST = 8
    ARTSTATION_PAGE = 13
    INPRNT_PAGE = 14
    FACEBOOK_PAGE = 15
    EMAIL = 16
    JSON_LD = 17  # Like PAGE_WATCH, but focused on embedded microdata.
    ETSY_SHOP = 19
    KO_FI = 20
    BEHANCE_PAGE = 21
    TIKTOK_USER = 7085
    PIXIV_USER_ID = 5435
    PIXIV_USER_NICKNAME = 31
    CARRD_PAGE = 24
    YOUTUBE_CHANNEL_HANDLE = 26
    YOUTUBE_CHANNEL_ID = 2397
    VIMEO_CHANNEL = 27
    NEWGROUNDS_PAGE = 28
    ARTSY_ARTIST = 2042
    ARTNET_ARTIST = 3782
    LINK_COLLECTION_PAGE = 29
    DEVIANT_ART_ACCOUNT = 7737
    DANBOORU_ARTIST = 30
    BANDCAMP_PROFILE = 3283
    ITCH_IO_DEVELOPER = 8176
    SOUNDCLOUD_ARTIST = 3040
    IGDB_GAME_ID = 5794
    STEAM_APPLICATION_ID = 1733
    GITHUB_REPOSITORY = 364
    PLURK = 32111
    LINKEDIN_PERSONAL_PROFILE = 6634

    # --- Browser bookmarks -----------------------------------------------
    FIREFOX_PROFILE_BOOKMARKS = 33
    FALKON_PROFILE_BOOKMARKS = 34

    def wikidata_property(self, client):
        """Fetch this site's Wikidata entry through ``client``.

        The ``WIKIDATA_PROPERTIES`` record for this member is passed to
        ``client.get`` and whatever that call returns is handed back.
        Raises ``KeyError`` when the member has no ``WIKIDATA_PROPERTIES``
        entry.
        """
        # NOTE(review): the whole WikidataInfo record is passed to
        # client.get, not a bare property id - confirm the client expects it.
        info = WIKIDATA_PROPERTIES[self]
        return client.get(info)

    def is_aggregator(self):
        """Return True when this site is listed in ``AGGERAGOR_SOCIALS``."""
        return self in AGGERAGOR_SOCIALS
# Members for which SocialSiteId.is_aggregator() returns True.
# Presumably these are sites whose pages mostly link out to other profiles.
# NOTE: the name is a misspelling of "AGGREGATOR"; it is kept unchanged
# because other code refers to it by this exact name.
AGGERAGOR_SOCIALS = {
    SocialSiteId.LINKTREE_PAGE,
    SocialSiteId.WIKIDATA,
    SocialSiteId.CARRD_PAGE,
    SocialSiteId.LINK_COLLECTION_PAGE,
    SocialSiteId.DANBOORU_ARTIST,
    SocialSiteId.IGDB_GAME_ID,
}
@enforce_types
@dataclass(frozen=True)
class SocialLink:
    """Immutable result of classifying a URL as belonging to a social site.

    Field types are checked at construction time by ``enforce_types``.
    """

    # The parsed URL that was classified.
    url: urllib.parse.ParseResult
    # Which kind of site the URL was recognised as.
    social_site_id: SocialSiteId
    # Identifier extracted from the URL, or None when none was captured.
    social_id: str | None
@enforce_types
@dataclass(frozen=True)
class WikidataInfo:
    """Immutable Wikidata metadata attached to a ``SocialSiteId``.

    Field types are checked at construction time by ``enforce_types``.
    """

    # Numeric id of the Wikidata property for this site, if any
    # (presumably the number in "P<number>" - confirm against consumers).
    property_id: int | None
    # Numeric id of the issuing entity, if any
    # (presumably the number in "Q<number>" - confirm against consumers).
    issuer_id: int | None
    # Set on an id-based site to point at its nickname-based counterpart.
    id_version_of: SocialSiteId | None = None
    # Set on a nickname-based site to point at its id-based counterpart.
    nickname_version_of: SocialSiteId | None = None
# Wikidata metadata for each SocialSiteId, as WikidataInfo(property_id,
# issuer_id, ...). Looked up by SocialSiteId.wikidata_property().
#
# Every key must appear exactly once: in a dict literal a repeated key is
# silently overwritten by the last occurrence. RSS_FEED used to be listed
# twice (1079, then 1019); only the effective value 1019 is kept.
WIKIDATA_PROPERTIES: dict[SocialSiteId, WikidataInfo] = {
    SocialSiteId.EMAIL: WikidataInfo(968, None),
    SocialSiteId.RSS_FEED: WikidataInfo(1019, None),
    SocialSiteId.FACEBOOK_PAGE: WikidataInfo(2013, None),
    SocialSiteId.INSTAGRAM_PAGE: WikidataInfo(2003, None),
    SocialSiteId.LINKTREE_PAGE: WikidataInfo(11079, None),
    SocialSiteId.REDDIT_SUBREDDIT: WikidataInfo(3984, None),
    SocialSiteId.REDDIT_USER: WikidataInfo(4265, None),
    SocialSiteId.SONGKICK_ARTIST: WikidataInfo(3478, None),
    SocialSiteId.TWITCH: WikidataInfo(5797, None),
    SocialSiteId.TWITTER: WikidataInfo(2002, None),
    SocialSiteId.WIKIDATA: WikidataInfo(None, 2013),
    SocialSiteId.TUMBLR: WikidataInfo(3943, None),
    SocialSiteId.TIKTOK_USER: WikidataInfo(7085, None),
    # Pixiv has both an id-based and a nickname-based identifier; the two
    # entries point at each other.
    SocialSiteId.PIXIV_USER_ID: WikidataInfo(
        5435,
        306956,
        id_version_of=SocialSiteId.PIXIV_USER_NICKNAME,
    ),
    SocialSiteId.PIXIV_USER_NICKNAME: WikidataInfo(
        None,
        306956,
        nickname_version_of=SocialSiteId.PIXIV_USER_ID,
    ),
    SocialSiteId.MASTODON_PAGE: WikidataInfo(4033, None),
    SocialSiteId.PATREON_PAGE: WikidataInfo(4175, 15861362),
    SocialSiteId.ARTSTATION_PAGE: WikidataInfo(None, 65551500),
    SocialSiteId.CARRD_PAGE: WikidataInfo(None, 106036503),
    # YouTube likewise has a handle-based and an id-based identifier.
    SocialSiteId.YOUTUBE_CHANNEL_HANDLE: WikidataInfo(
        11245,
        866,
        nickname_version_of=SocialSiteId.YOUTUBE_CHANNEL_ID,
    ),
    SocialSiteId.YOUTUBE_CHANNEL_ID: WikidataInfo(
        2397,
        866,
        id_version_of=SocialSiteId.YOUTUBE_CHANNEL_HANDLE,
    ),
    SocialSiteId.VIMEO_CHANNEL: WikidataInfo(4015, 156376),
    SocialSiteId.NEWGROUNDS_PAGE: WikidataInfo(None, 263655),
    SocialSiteId.ARTSY_ARTIST: WikidataInfo(2042, 4796642),
    SocialSiteId.ARTNET_ARTIST: WikidataInfo(3782, 266566),
    SocialSiteId.DEVIANT_ART_ACCOUNT: WikidataInfo(7737, None),
    SocialSiteId.DANBOORU_ARTIST: WikidataInfo(None, 64514853),
    SocialSiteId.BANDCAMP_PROFILE: WikidataInfo(3283, 545966),
    SocialSiteId.BLUESKY_PROFILE: WikidataInfo(12361, 78194383),
    SocialSiteId.BLUESKY_DID: WikidataInfo(12409, 78194383),
    SocialSiteId.THREADS_USERNAME: WikidataInfo(11892, 120281745),
    SocialSiteId.ITCH_IO_DEVELOPER: WikidataInfo(8176, 22905933),
    SocialSiteId.COHOST_PROFILE: WikidataInfo(None, 117203288),
    SocialSiteId.SOUNDCLOUD_ARTIST: WikidataInfo(3040, None),
    SocialSiteId.IGDB_GAME_ID: WikidataInfo(5794, None),
    SocialSiteId.STEAM_APPLICATION_ID: WikidataInfo(1733, None),
    SocialSiteId.GITHUB_REPOSITORY: WikidataInfo(None, 364),
    SocialSiteId.LINKEDIN_PERSONAL_PROFILE: WikidataInfo(6634, None),
    SocialSiteId.MEDIUM_BLOG: WikidataInfo(3899, None),
    SocialSiteId.SUBSTACK: WikidataInfo(12007, None),
    SocialSiteId.INPRNT_PAGE: WikidataInfo(None, None),
    SocialSiteId.ETSY_SHOP: WikidataInfo(None, 1353939),
    SocialSiteId.KO_FI: WikidataInfo(None, 77949925),
    SocialSiteId.BEHANCE_PAGE: WikidataInfo(None, 4880667),
    SocialSiteId.PLURK: WikidataInfo(None, 32111),
    SocialSiteId.GOOGLE_BLOGGER_PAGE: WikidataInfo(None, 171186),
    # Internal pseudo-sites with no Wikidata counterpart.
    SocialSiteId.LINK_COLLECTION_PAGE: WikidataInfo(None, None),
    SocialSiteId.PAGE_WATCH: WikidataInfo(None, None),
    SocialSiteId.JSON_LD: WikidataInfo(None, None),
    SocialSiteId.FIREFOX_PROFILE_BOOKMARKS: WikidataInfo(None, None),
    SocialSiteId.FALKON_PROFILE_BOOKMARKS: WikidataInfo(None, None),
}
def re_social_subdomain(main_domain: str) -> str:
    """Build a regex for URLs of the form ``<name>.<main_domain>[/...]``.

    The scheme is optional. Group 1 captures the subdomain label and
    group 2 the optional path. ``main_domain`` is escaped, so it is
    matched literally.
    """
    escaped_domain = re.escape(main_domain)
    pieces = (
        r'^(?:https?:\/\/)?([\w_-]+)\.',
        escaped_domain,
        r'(\/.*)?$',
    )
    return ''.join(pieces)
# Path-segment placeholders recognised by re_social_path_adv, which inserts
# them into the generated regex verbatim instead of escaping them.
# One path segment, optionally prefixed with '@'; the segment is captured.
RE_ID = r'@?([^/]+)'
# Two consecutive path segments joined by '/', captured as one group.
RE_DUAL_ID = r'@?([^/]+/[^/]+)'
# Nothing, a lone '/', or any further subpath; anchors the end of the string.
RE_ANY_SUBPATH = r'(|\/|\/.*)$'
def re_social_path(main_domain: str) -> str:
    """Build a regex for URLs of the form ``<main_domain>/<id>``.

    Shorthand for ``re_social_path_adv`` with a single ``RE_ID`` segment.
    """
    pattern = re_social_path_adv(main_domain, RE_ID)
    return pattern
def re_social_path_adv(main_domain: str, *path: str) -> str:
    """Build a regex for URLs on ``main_domain`` followed by ``path`` segments.

    The generated pattern accepts an optional ``http(s)://`` scheme and an
    optional ``www.`` prefix before the (escaped) domain.

    Args:
        main_domain: Domain without a leading ``www.``.
        *path: Path segments in order. ``RE_ID``, ``RE_DUAL_ID`` and
            ``RE_ANY_SUBPATH`` are inserted as regex fragments; any other
            string is matched literally. May be empty, in which case only
            the bare domain (with optional trailing slash) matches.

    Returns:
        The regex source string.

    Raises:
        ValueError: If ``main_domain`` starts with ``www.``.
    """
    if main_domain.startswith('www.'):
        msg = f'Redundant www: {main_domain}'
        raise ValueError(msg)

    regex_builder: list[str] = [
        r'^',
        r'(?:https?:\/\/)?',
        r'(?:www\.)?',
        re.escape(main_domain),
    ]

    placeholders = {RE_ID, RE_DUAL_ID, RE_ANY_SUBPATH}
    for p in path:
        # RE_ANY_SUBPATH supplies its own leading slash (or none at all).
        if p != RE_ANY_SUBPATH:
            regex_builder.append(r'\/')
        regex_builder.append(p if p in placeholders else re.escape(p))

    # RE_ANY_SUBPATH already ends with '$'; every other ending gets an
    # optional trailing slash. `not path` avoids an IndexError on path[-1]
    # when no segments were given.
    if not path or path[-1] != RE_ANY_SUBPATH:
        regex_builder.append(r'\/?$')
    return ''.join(regex_builder)
# Regex source strings for the URL shapes of individual sites. Hand-written
# patterns are used where the URL shape does not fit the re_social_path* /
# re_social_subdomain builders.

# NOTE(review): the character class `[\w._.]` lists `.` twice and `_` is
# already covered by `\w`; addresses containing e.g. `-` or `+` do not match.
# Confirm whether `[\w._-]` (or similar) was intended.
MAILTO_URL = r'^mailto:(?:[\w._.]+@[\w._.]+)$'
# Reddit: optional "old." subdomain; group 1 is the subreddit / user name.
REDDIT_SUBREDDIT_URL = r'^(?:https?:\/\/)?(?:old\.)?reddit\.com\/r\/([\w-]+)\/?$'
REDDIT_USER_URL = (
    r'^(?:https?:\/\/)?(?:old\.)?reddit\.com\/user\/([\w-]+)(?:|\/submitted)\/?$'
)
TWITTER_HANDLE_URL_1 = re_social_path('twitter.com')
TWITTER_HANDLE_URL_2 = re_social_path('x.com')
LINKTREE_PAGE_URL = re_social_path('linktr.ee')
TWITCH_STREAM_URL = re_social_path('twitch.tv')
WIKIDATA_ITEM_URL = re_social_path_adv('wikidata.org', 'wiki', RE_ID)
# Songkick: group 1 is the numeric artist id, group 2 the trailing slug.
SONGKICK_ARTIST_URL = (
    r'^(?:https?:\/\/)?(?:www\.)?songkick\.com\/artists\/(\d+)([\w-]*)\/?$'
)
# Tumblr blogs are recognised both as a path and as a subdomain.
TUMBLR_PAGE_URL = re_social_path('tumblr.com')
TUMBLR_PAGE_URL_2 = re_social_subdomain('tumblr.com')
INSTAGRAM_URL = re_social_path('instagram.com')
PATREON_URL = re_social_path_adv('patreon.com', RE_ID, RE_ANY_SUBPATH)
ARTSTATION_URL = re_social_path('artstation.com')
INPRNT_URL = re_social_path_adv('inprnt.com', 'gallery', RE_ID)
FACEBOOK_PAGE_URL = re_social_path('facebook.com')
SUBSTACK_PREFIX_URL = re_social_subdomain('substack.com')
ETSY_SHOP_URL = re_social_path_adv('etsy.com', 'shop', RE_ID)
BEHANCE_PAGE_URL = re_social_path('behance.net')
TIKTOK_USER_URL = re_social_path('tiktok.com')
# Pixiv numeric user ids: "/users/<id>" and the "member.php?id=<id>" form,
# both with an optional "/en" prefix.
PIXIV_USER_ID_URL = r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/users/(\d+)\/?$'
PIXIV_USER_ID_URL_2 = (
    r'^(?:https?:\/\/)?(?:www\.)?pixiv\.net(?:\/en)?\/member\.php\/?[?]id=(\d+)$'
)
# Pixiv nicknames: fanbox subdomain, "stacc" path and sketch.pixiv.net path.
PIXIV_FANBOX_USER_NICKNAME_URL = re_social_subdomain('fanbox.cc')
PIXIV_USER_NICKNAME_URL = re_social_path_adv('pixiv.net', 'stacc', RE_ID)
PIXIV_SKETCH_USER_NICKNAME_URL = re_social_path_adv('sketch.pixiv.net', RE_ID)
URL_PARSE_CARRD_PAGE = re_social_subdomain('carrd.co')
URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1 = re_social_path_adv('youtube.com', RE_ID)
URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2 = re_social_path_adv('youtube.com', 'c', RE_ID)
URL_PARSE_YOUTUBE_CHANNEL_ID = re_social_path_adv('youtube.com', 'channel', RE_ID)
URL_PARSE_VIMEO_CHANNEL = re_social_path_adv('vimeo.com', RE_ID)
URL_PARSE_NEWGROUNDS_PAGE = re_social_subdomain('newgrounds.com')
URL_PARSE_ARTSY_ARTIST = re_social_path_adv(
    'artsy.net',
    'artist',
    RE_ID,
    RE_ANY_SUBPATH,
)
URL_PARSE_ARTNET_ARTIST = re_social_path_adv(
    'artnet.com',
    'artists',
    RE_ID,
    RE_ANY_SUBPATH,
)
# DeviantArt accounts are recognised both as a path and as a subdomain.
URL_PARSE_DEVIANT_ART_ACCOUNT = re_social_path_adv('deviantart.com', RE_ID)
URL_PARSE_DEVIANT_ART_ACCOUNT_2 = re_social_subdomain('deviantart.com')
URL_PARSE_DANBOORU_ARTIST = re_social_path_adv('danbooru.donmai.us', 'artists', RE_ID)
URL_PARSE_BANDCAMP = re_social_subdomain('bandcamp.com')
URL_PARSE_BLUESKY = re_social_path_adv('bsky.app', 'profile', RE_ID)
# Ordered (pattern, site) table used by determine_social_from_url_internally.
# Patterns are tried top to bottom and the first full match wins, so the
# order of entries is significant. The first capture group of a pattern,
# when present, is reported as the social id.
REGEXES: list[tuple[str, SocialSiteId]] = [
    # Reddit
    (REDDIT_SUBREDDIT_URL, SocialSiteId.REDDIT_SUBREDDIT),
    (REDDIT_USER_URL, SocialSiteId.REDDIT_USER),
    # Twitter
    (TWITTER_HANDLE_URL_1, SocialSiteId.TWITTER),
    (TWITTER_HANDLE_URL_2, SocialSiteId.TWITTER),
    # Facebook
    (FACEBOOK_PAGE_URL, SocialSiteId.FACEBOOK_PAGE),
    # Linktr.ee
    (LINKTREE_PAGE_URL, SocialSiteId.LINKTREE_PAGE),
    # Twitch.tv
    (TWITCH_STREAM_URL, SocialSiteId.TWITCH),
    # Wikidata
    (WIKIDATA_ITEM_URL, SocialSiteId.WIKIDATA),
    # Songkick
    (SONGKICK_ARTIST_URL, SocialSiteId.SONGKICK_ARTIST),
    # Tumblr
    (TUMBLR_PAGE_URL, SocialSiteId.TUMBLR),
    (TUMBLR_PAGE_URL_2, SocialSiteId.TUMBLR),
    # Instagram
    (INSTAGRAM_URL, SocialSiteId.INSTAGRAM_PAGE),
    # Tiktok
    (TIKTOK_USER_URL, SocialSiteId.TIKTOK_USER),
    # Pixiv
    (PIXIV_USER_ID_URL, SocialSiteId.PIXIV_USER_ID),
    (PIXIV_USER_ID_URL_2, SocialSiteId.PIXIV_USER_ID),
    (PIXIV_FANBOX_USER_NICKNAME_URL, SocialSiteId.PIXIV_USER_NICKNAME),
    (PIXIV_USER_NICKNAME_URL, SocialSiteId.PIXIV_USER_NICKNAME),
    (PIXIV_SKETCH_USER_NICKNAME_URL, SocialSiteId.PIXIV_USER_NICKNAME),
    # Patreon
    (PATREON_URL, SocialSiteId.PATREON_PAGE),
    # Artstation
    (ARTSTATION_URL, SocialSiteId.ARTSTATION_PAGE),
    # Inprnt
    (INPRNT_URL, SocialSiteId.INPRNT_PAGE),
    # Email
    (MAILTO_URL, SocialSiteId.EMAIL),
    # Substack
    (SUBSTACK_PREFIX_URL, SocialSiteId.SUBSTACK),
    # Medium
    (re_social_path_adv('medium.com', RE_ID), SocialSiteId.MEDIUM_BLOG),
    (re_social_subdomain('medium.com'), SocialSiteId.MEDIUM_BLOG),
    # Etsy shop
    (ETSY_SHOP_URL, SocialSiteId.ETSY_SHOP),
    # Ko-fi
    (re_social_path_adv('ko-fi.com', RE_ID), SocialSiteId.KO_FI),
    (re_social_path_adv('ko-fi.com', RE_ID, 'shop'), SocialSiteId.KO_FI),
    # Behance
    (BEHANCE_PAGE_URL, SocialSiteId.BEHANCE_PAGE),
    # Carrd
    (URL_PARSE_CARRD_PAGE, SocialSiteId.CARRD_PAGE),
    # Youtube
    (URL_PARSE_YOUTUBE_CHANNEL_HANDLE_1, SocialSiteId.YOUTUBE_CHANNEL_HANDLE),
    (URL_PARSE_YOUTUBE_CHANNEL_HANDLE_2, SocialSiteId.YOUTUBE_CHANNEL_HANDLE),
    (URL_PARSE_YOUTUBE_CHANNEL_ID, SocialSiteId.YOUTUBE_CHANNEL_ID),
    # Vimeo
    (URL_PARSE_VIMEO_CHANNEL, SocialSiteId.VIMEO_CHANNEL),
    # Newgrounds
    (URL_PARSE_NEWGROUNDS_PAGE, SocialSiteId.NEWGROUNDS_PAGE),
    # Artsy and Artnet
    (URL_PARSE_ARTSY_ARTIST, SocialSiteId.ARTSY_ARTIST),
    (URL_PARSE_ARTNET_ARTIST, SocialSiteId.ARTNET_ARTIST),
    # DeviantArt
    (URL_PARSE_DEVIANT_ART_ACCOUNT, SocialSiteId.DEVIANT_ART_ACCOUNT),
    (URL_PARSE_DEVIANT_ART_ACCOUNT_2, SocialSiteId.DEVIANT_ART_ACCOUNT),
    # Danbooru
    (URL_PARSE_DANBOORU_ARTIST, SocialSiteId.DANBOORU_ARTIST),
    # Bandcamp
    (URL_PARSE_BANDCAMP, SocialSiteId.BANDCAMP_PROFILE),
    # Bluesky
    (URL_PARSE_BLUESKY, SocialSiteId.BLUESKY_PROFILE),
    # Threads
    (re_social_path_adv('threads.net', RE_ID), SocialSiteId.THREADS_USERNAME),
    # Itch.io
    (re_social_subdomain('itch.io'), SocialSiteId.ITCH_IO_DEVELOPER),
    # Cohost
    (re_social_path_adv('cohost.org', RE_ID), SocialSiteId.COHOST_PROFILE),
    # Soundcloud
    (re_social_path_adv('soundcloud.com', RE_ID), SocialSiteId.SOUNDCLOUD_ARTIST),
    # IGDB
    (re_social_path_adv('igdb.com', 'games', RE_ID), SocialSiteId.IGDB_GAME_ID),
    # Steam game
    (
        re_social_path_adv('store.steampowered.com', 'app', RE_ID, RE_ANY_SUBPATH),
        SocialSiteId.STEAM_APPLICATION_ID,
    ),
    # GitHub (group 1 is "owner/repository")
    (re_social_path_adv('github.com', RE_DUAL_ID), SocialSiteId.GITHUB_REPOSITORY),
    # Plurk
    (re_social_path_adv('plurk.com', RE_ID), SocialSiteId.PLURK),
    # LinkedIn
    (
        re_social_path_adv('linkedin.com', 'in', RE_ID),
        SocialSiteId.LINKEDIN_PERSONAL_PROFILE,
    ),
    # Google Blogger
    (re_social_subdomain('blogspot.com'), SocialSiteId.GOOGLE_BLOGGER_PAGE),
]
# Hostnames treated as Mastodon instances. URLs that begin with
# 'https://' + one of these hostnames are classified as MASTODON_PAGE by
# determine_social_from_url_internally.
WELL_KNOWN_MASTODON_INSTANCES: frozenset[str] = frozenset(
    {
        # Includes all servers with 50,000+ users as of 6 July 2023,
        # based on https://mastodonservers.net/servers/top
        'mastodon.social',
        # 'pawoo.net' is deliberately left disabled (reason not recorded here).
        'baraag.net',
        'mstdn.jp',
        'mastodon.cloud',
        'mstdn.social',
        'mastodon.online',
        'mas.to',
        'mastodon.world',
        'mastodon.lol',
        'mastodon.sdf.org',
        'c.im',
        'mastodon.uno',
        'mastodonapp.uk',
        'fosstodon.org',
        'idlethumbs.social',
    },
)
def determine_social_from_url_internally(
    url: str,
) -> tuple[SocialSiteId | None, str | None]:
    """Classify a URL string as a social site.

    Checks are applied in order:

    1. The ``REGEXES`` table (case-insensitive full match, first hit wins);
       the first capture group, if any, becomes the social id.
    2. ``https://`` URLs on a host in ``WELL_KNOWN_MASTODON_INSTANCES``.
    3. Any URL containing the substring ``mastodon``.
    4. Any URL containing ``feed``, ``xml``, ``rss`` or ``atom`` (treated
       as an RSS feed).

    Steps 2-4 are heuristics and never produce a social id.

    Args:
        url: The URL to classify.

    Returns:
        ``(site, social_id)``; both are ``None`` when nothing matched.
    """
    assert isinstance(url, str)

    # Regexes
    for social_site_url_regex, social_site_id in REGEXES:
        if m := re.fullmatch(social_site_url_regex, url, re.I):
            groups = m.groups()
            return (social_site_id, groups[0] if groups else None)

    # Mastodon
    # The hostname must end where the prefix ends. A bare startswith() would
    # also accept unrelated hosts that merely begin with an instance name,
    # e.g. 'https://c.imgur.example' for the instance 'c.im'.
    host_terminators = ('', '/', '?', '#', ':')
    for mastodon_hostname in WELL_KNOWN_MASTODON_INSTANCES:
        prefix = 'https://' + mastodon_hostname
        if not url.startswith(prefix):
            continue
        # Slicing yields '' when the URL ends right after the hostname.
        if url[len(prefix) : len(prefix) + 1] in host_terminators:
            return (SocialSiteId.MASTODON_PAGE, None)

    if 'mastodon' in url:
        return (SocialSiteId.MASTODON_PAGE, None)

    # Feed (?)
    if any(marker in url for marker in ('feed', 'xml', 'rss', 'atom')):
        return (SocialSiteId.RSS_FEED, None)

    return (None, None)
def to_parse_result(url: str | urllib.parse.ParseResult) -> urllib.parse.ParseResult:
if isinstance(url, str):
return urllib.parse.urlparse(url)
if isinstance(url, urllib.parse.ParseResult):
return url
# Throw error
msg = f'Expected {urllib.parse.ParseResult} or {str}'
raise TypeError(msg)
def determine_social_from_url(
    url_not_normalized: str | urllib.parse.ParseResult,
) -> SocialLink | None:
    """Classify a URL and wrap the result in a ``SocialLink``.

    The URL is first classified with both query string and fragment
    removed; if that yields nothing, it is retried with only the fragment
    removed. The returned ``SocialLink`` carries the parsed URL as given
    (query and fragment intact).

    Returns:
        A ``SocialLink``, or ``None`` when neither attempt recognised the URL.
    """
    url = to_parse_result(url_not_normalized)

    # Most specific normalisation first: no query, then query kept.
    attempts = (
        url._replace(query='', fragment=''),
        url._replace(fragment=''),
    )
    for candidate in attempts:
        social_site_id, social_id = determine_social_from_url_internally(
            candidate.geturl(),
        )
        if social_site_id:
            return SocialLink(url, social_site_id, social_id)
    return None