Compare commits

3 Commits

SHA1        Message      Date
6e6e5e63be  Ruff checks  2024-05-12 16:38:25 +02:00  (CI: Python Package / Package (push) successful in 20s)
cc4b666011  Ruff         2024-05-12 16:35:29 +02:00
1aa41a8414  More typing  2024-05-12 16:34:47 +02:00
5 changed files with 84 additions and 51 deletions

@@ -1,22 +1,17 @@
-"""
-Small utility for detecting social websites.
-"""
-from dataclasses import dataclass
-from enforce_typing import enforce_types
-from typing import List, Set, Optional, Union
-import aenum
-import datetime
+"""Small utility for detecting social websites."""
+
 import re
 import urllib.parse
-from socials_util._version import __version__
+from dataclasses import dataclass
+
+import aenum
+from enforce_typing import enforce_types
+
+from socials_util._version import __version__  # noqa: F401


 class SocialSiteId(aenum.Enum):
-    """
-    The great social website enum.
-    """
+    """The great social website enum."""

     # Reddit-like
     REDDIT = 1  # Should have been named REDDIT_SUBREDDIT
@@ -103,22 +98,22 @@ AGGERAGOR_SOCIALS = {

 @enforce_types
 @dataclass(frozen=True)
-class SocialLink(object):
+class SocialLink:
     url: urllib.parse.ParseResult
     social_site_id: SocialSiteId
-    social_id: Optional[str]
+    social_id: str | None


 @enforce_types
 @dataclass(frozen=True)
-class WikidataInfo(object):
-    property_id: Optional[int]
-    issuer_id: Optional[int]
-    id_version_of: Optional[SocialSiteId] = None
-    nickname_version_of: Optional[SocialSiteId] = None
+class WikidataInfo:
+    property_id: int | None
+    issuer_id: int | None
+    id_version_of: SocialSiteId | None = None
+    nickname_version_of: SocialSiteId | None = None


-WIKIDATA_PROPERTIES = {
+WIKIDATA_PROPERTIES: dict[SocialSiteId, WikidataInfo] = {
     SocialSiteId.EMAIL: WikidataInfo(968, None),
     SocialSiteId.RSS_FEED: WikidataInfo(1079, None),
     SocialSiteId.FACEBOOK_PAGE: WikidataInfo(2013, None),
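For context: the Optional[...] to X | None changes rely on PEP 604 union syntax, so this file now requires Python 3.10+. A minimal sketch of the runtime behavior, assuming enforce_typing validates the new union annotations (the values are illustrative):

    # Sketch only; mirrors the WikidataInfo definition above.
    info = WikidataInfo(property_id=968, issuer_id=None)  # int | None accepts None
    # WikidataInfo(property_id='968', issuer_id=None) would be rejected with a
    # TypeError by @enforce_types, since '968' is not an int or None.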
@@ -134,10 +129,14 @@ WIKIDATA_PROPERTIES = {
     SocialSiteId.TUMBLR: WikidataInfo(3943, None),
     SocialSiteId.TIKTOK_USER: WikidataInfo(7085, None),
     SocialSiteId.PIXIV_USER_ID: WikidataInfo(
-        5435, 306956, id_version_of=SocialSiteId.PIXIV_USER_NICKNAME
+        5435,
+        306956,
+        id_version_of=SocialSiteId.PIXIV_USER_NICKNAME,
     ),
     SocialSiteId.PIXIV_USER_NICKNAME: WikidataInfo(
-        None, 306956, nickname_version_of=SocialSiteId.PIXIV_USER_ID
+        None,
+        306956,
+        nickname_version_of=SocialSiteId.PIXIV_USER_ID,
     ),
     SocialSiteId.MASTODON_PAGE: WikidataInfo(4033, None),
     SocialSiteId.PATREON_PAGE: WikidataInfo(4175, 15861362),
@@ -145,10 +144,14 @@ WIKIDATA_PROPERTIES = {
     # SocialSiteId.INPRNT_PAGE: WikidataInfo(None, None),
     SocialSiteId.CARRD_PAGE: WikidataInfo(None, 106036503),
     SocialSiteId.YOUTUBE_CHANNEL_HANDLE: WikidataInfo(
-        11245, 866, nickname_version_of=SocialSiteId.YOUTUBE_CHANNEL_ID
+        11245,
+        866,
+        nickname_version_of=SocialSiteId.YOUTUBE_CHANNEL_ID,
     ),
     SocialSiteId.YOUTUBE_CHANNEL_ID: WikidataInfo(
-        2397, 866, id_version_of=SocialSiteId.YOUTUBE_CHANNEL_HANDLE
+        2397,
+        866,
+        id_version_of=SocialSiteId.YOUTUBE_CHANNEL_HANDLE,
     ),
     SocialSiteId.VIMEO_CHANNEL: WikidataInfo(4015, 156376),
     SocialSiteId.NEWGROUNDS_PAGE: WikidataInfo(None, 263655),
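For illustration, the id/nickname cross-references in this mapping can be read back like so (sketch only, not part of the diff):

    info = WIKIDATA_PROPERTIES[SocialSiteId.YOUTUBE_CHANNEL_HANDLE]
    assert info.property_id == 11245
    assert info.nickname_version_of is SocialSiteId.YOUTUBE_CHANNEL_ID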
@@ -184,8 +187,7 @@ WIKIDATA_PROPERTIES = {
 }


-def re_social_subdomain(main_domain):
-    # return r'^(?:https?:\/\/)?([\w_-]+)\.'+re.escape(main_domain)+'\/?$'
+def re_social_subdomain(main_domain: str) -> str:
     return r'^(?:https?:\/\/)?([\w_-]+)\.' + re.escape(main_domain) + r'(\/.*)?$'
@@ -194,23 +196,30 @@ RE_DUAL_ID = r'@?([^/]+/[^/]+)'
 RE_ANY_SUBPATH = r'(|\/|\/.*)$'


-def re_social_path(main_domain):
-    # return r'^(?:https?:\/\/)?(?:www\.)?'+re.escape(main_domain)+'\/'+RE_ID+'\/?$'
+def re_social_path(main_domain: str) -> str:
     return re_social_path_adv(main_domain, RE_ID)


-def re_social_path_adv(main_domain, *path):
-    assert not main_domain.startswith('www.'), 'Redundant www.'
-    l = [r'^', r'(?:https?:\/\/)?', r'(?:www\.)?', re.escape(main_domain)]
+def re_social_path_adv(main_domain: str, *path: str) -> str:
+    if main_domain.startswith('www.'):
+        msg = f'Redundant www: {main_domain}'
+        raise ValueError(msg)
+    regex_builder: list[str] = [
+        r'^',
+        r'(?:https?:\/\/)?',
+        r'(?:www\.)?',
+        re.escape(main_domain),
+    ]
     for p in path:
         if p != RE_ANY_SUBPATH:
-            l.append(r'\/')
-        l.append(p if p in {RE_ID, RE_DUAL_ID, RE_ANY_SUBPATH} else re.escape(p))
+            regex_builder.append(r'\/')
+        regex_builder.append(
+            p if p in {RE_ID, RE_DUAL_ID, RE_ANY_SUBPATH} else re.escape(p),
+        )
     if path[-1] != RE_ANY_SUBPATH:
-        l.append(r'\/?$')
-    regex = ''.join(l)
-    return regex
+        regex_builder.append(r'\/?$')
+    return ''.join(regex_builder)


 MAILTO_URL = r'^mailto:(?:[\w._.]+@[\w._.]+)$'
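For illustration (not part of the diff): assuming RE_ID matches a single path segment, as its uses below suggest, the rewritten builder produces patterns like this:

    pattern = re_social_path_adv('bsky.app', 'profile', RE_ID)
    # Scheme and 'www.' are optional in the resulting match:
    assert re.match(pattern, 'https://bsky.app/profile/someone.example')
    assert re.match(pattern, 'bsky.app/profile/someone.example')
    # A redundant 'www.' in the argument now raises ValueError instead of
    # failing an assert:
    # re_social_path_adv('www.bsky.app', 'profile', RE_ID)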
@@ -253,10 +262,16 @@ URL_PARSE_YOUTUBE_CHANNEL_ID = re_social_path_adv('youtube.com', 'channel', RE_I
 URL_PARSE_VIMEO_CHANNEL = re_social_path_adv('vimeo.com', RE_ID)
 URL_PARSE_NEWGROUNDS_PAGE = re_social_subdomain('newgrounds.com')
 URL_PARSE_ARTSY_ARTIST = re_social_path_adv(
-    'artsy.net', 'artist', RE_ID, RE_ANY_SUBPATH
+    'artsy.net',
+    'artist',
+    RE_ID,
+    RE_ANY_SUBPATH,
 )
 URL_PARSE_ARTNET_ARTIST = re_social_path_adv(
-    'artnet.com', 'artists', RE_ID, RE_ANY_SUBPATH
+    'artnet.com',
+    'artists',
+    RE_ID,
+    RE_ANY_SUBPATH,
 )
 URL_PARSE_DEVIANT_ART_ACCOUNT = re_social_path_adv('deviantart.com', RE_ID)
 URL_PARSE_DEVIANT_ART_ACCOUNT_2 = re_social_subdomain('deviantart.com')
@@ -264,7 +279,7 @@ URL_PARSE_DANBOORU_ARTIST = re_social_path_adv('danbooru.donmai.us', 'artists',
 URL_PARSE_BANDCAMP = re_social_subdomain('bandcamp.com')
 URL_PARSE_BLUESKY = re_social_path_adv('bsky.app', 'profile', RE_ID)

-REGEXES = [
+REGEXES: list[tuple[str, SocialSiteId]] = [
     # Reddit
     (REDDIT_SUBREDDIT_URL, SocialSiteId.REDDIT_SUBREDDIT),
     (REDDIT_USER_URL, SocialSiteId.REDDIT_USER),
@@ -364,7 +379,7 @@ REGEXES = [
     (re_social_subdomain('blogspot.com'), SocialSiteId.GOOGLE_BLOGGER_PAGE),
 ]

-WELL_KNOWN_MASTODON_INSTANCES = frozenset(
+WELL_KNOWN_MASTODON_INSTANCES: frozenset[str] = frozenset(
     {
         # Includes all servers with 50 000+ users as of 6 july 2023.
         # based on https://mastodonservers.net/servers/top
@@ -384,12 +399,16 @@ WELL_KNOWN_MASTODON_INSTANCES = frozenset(
         'mastodonapp.uk',
         'fosstodon.org',
         'idlethumbs.social',
-    }
+    },
 )


-def determine_social_from_url_internally(url: str):
-    assert isinstance(url, str)
+def determine_social_from_url_internally(
+    url: str,
+) -> tuple[SocialSiteId | None, str | None]:
+    if not isinstance(url, str):
+        msg = f'Url must be {str}'
+        raise TypeError(msg)

     # Regexes
     for social_site_url_regex, social_site_id in REGEXES:
@@ -405,21 +424,33 @@ def determine_social_from_url_internally(url: str):
         return (SocialSiteId.MASTODON_PAGE, None)

     # Feed (?)
-    elif 'feed' in url or 'xml' in url or 'rss' in url or 'atom' in url:
+    if 'feed' in url or 'xml' in url or 'rss' in url or 'atom' in url:
         return (SocialSiteId.RSS_FEED, None)

     return (None, None)


-def determine_social_from_url(url):
+def to_parse_result(url: str | urllib.parse.ParseResult) -> urllib.parse.ParseResult:
     if isinstance(url, str):
-        url = urllib.parse.urlparse(url)
+        return urllib.parse.urlparse(url)
+    if isinstance(url, urllib.parse.ParseResult):
+        return url
+
+    # Throw error
+    msg = f'Expected {urllib.parse.ParseResult} or {str}'
+    raise TypeError(msg)
+
+
+def determine_social_from_url(
+    url_not_normalized: str | urllib.parse.ParseResult,
+) -> SocialLink | None:
+    url = to_parse_result(url_not_normalized)
     (social_site_id, social_id) = determine_social_from_url_internally(
-        url._replace(query='', fragment='').geturl()
+        url._replace(query='', fragment='').geturl(),
     )
     if not social_site_id:
         (social_site_id, social_id) = determine_social_from_url_internally(
-            url._replace(fragment='').geturl()
+            url._replace(fragment='').geturl(),
         )
     if not social_site_id:
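A hedged usage sketch of the refactored entry point (the exact return value is an assumption; the URL/'dril' pairing comes from the test data below):

    link = determine_social_from_url('https://www.twitter.com/dril?ref=abc#bio')
    # Query and fragment are stripped before matching, so this should yield a
    # SocialLink with social_id == 'dril' rather than None.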

test/__init__.py (new, empty file)
@@ -1,6 +1,7 @@
-import socials_util
 import pytest

+import socials_util
+

 @pytest.mark.parametrize('social_site_id', list(socials_util.SocialSiteId))
 def test_consistency(social_site_id):

@@ -1,6 +1,7 @@
-from socials_util import *
 import aenum

+from socials_util import *
+

 def test_extension():
     MY_SECRET_SITE = aenum.extend_enum(SocialSiteId, 'MY_SECRET_SITE', 666)

@@ -1,6 +1,6 @@
-from socials_util import *
 import pytest

+from socials_util import *

 PARSABLE_SOCIAL_IDS = [
     ('http://www.twitter.com/dril', 'dril'),
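The diff is truncated here; a plausible, hypothetical shape for a test consuming these pairs (the test name and body are not from the source):

    @pytest.mark.parametrize(('url', 'expected_id'), PARSABLE_SOCIAL_IDS)
    def test_parsing(url: str, expected_id: str) -> None:
        social_link = determine_social_from_url(url)
        assert social_link is not None
        assert social_link.social_id == expected_id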