Jon Michael Aanes 2024-05-12 16:35:17 +02:00
parent 1aa41a8414
commit cc4b666011
Signed by: Jmaa
SSH Key Fingerprint: SHA256:Ab0GfHGCblESJx7JRE4fj4bFy/KRpeLhi41y4pF3sNA
4 changed files with 59 additions and 33 deletions

View File

@@ -1,22 +1,19 @@
-"""
-Small utility for detecting social websites.
-"""
-from dataclasses import dataclass
-from enforce_typing import enforce_types
-from typing import List, Set, Optional, Union
-import aenum
+"""Small utility for detecting social websites."""
 import datetime
 import re
 import urllib.parse
+from dataclasses import dataclass
+from typing import List, Optional, Set, Union
+
+import aenum
+from enforce_typing import enforce_types
 
 from socials_util._version import __version__
 
 
 class SocialSiteId(aenum.Enum):
-    """
-    The great social website enum.
-    """
+    """The great social website enum."""
 
     # Reddit-like
     REDDIT = 1  # Should have been named REDDIT_SUBREDDIT
@@ -103,19 +100,19 @@ AGGERAGOR_SOCIALS = {
 @enforce_types
 @dataclass(frozen=True)
-class SocialLink(object):
+class SocialLink:
     url: urllib.parse.ParseResult
     social_site_id: SocialSiteId
-    social_id: Optional[str]
+    social_id: str | None
 
 
 @enforce_types
 @dataclass(frozen=True)
-class WikidataInfo(object):
-    property_id: Optional[int]
-    issuer_id: Optional[int]
-    id_version_of: Optional[SocialSiteId] = None
-    nickname_version_of: Optional[SocialSiteId] = None
+class WikidataInfo:
+    property_id: int | None
+    issuer_id: int | None
+    id_version_of: SocialSiteId | None = None
+    nickname_version_of: SocialSiteId | None = None
 
 
 WIKIDATA_PROPERTIES: dict[SocialSiteId, WikidataInfo] = {
@@ -134,10 +131,14 @@ WIKIDATA_PROPERTIES: dict[SocialSiteId, WikidataInfo] = {
     SocialSiteId.TUMBLR: WikidataInfo(3943, None),
     SocialSiteId.TIKTOK_USER: WikidataInfo(7085, None),
     SocialSiteId.PIXIV_USER_ID: WikidataInfo(
-        5435, 306956, id_version_of=SocialSiteId.PIXIV_USER_NICKNAME
+        5435,
+        306956,
+        id_version_of=SocialSiteId.PIXIV_USER_NICKNAME,
     ),
     SocialSiteId.PIXIV_USER_NICKNAME: WikidataInfo(
-        None, 306956, nickname_version_of=SocialSiteId.PIXIV_USER_ID
+        None,
+        306956,
+        nickname_version_of=SocialSiteId.PIXIV_USER_ID,
     ),
     SocialSiteId.MASTODON_PAGE: WikidataInfo(4033, None),
     SocialSiteId.PATREON_PAGE: WikidataInfo(4175, 15861362),
@@ -145,10 +146,14 @@ WIKIDATA_PROPERTIES: dict[SocialSiteId, WikidataInfo] = {
     # SocialSiteId.INPRNT_PAGE: WikidataInfo(None, None),
     SocialSiteId.CARRD_PAGE: WikidataInfo(None, 106036503),
     SocialSiteId.YOUTUBE_CHANNEL_HANDLE: WikidataInfo(
-        11245, 866, nickname_version_of=SocialSiteId.YOUTUBE_CHANNEL_ID
+        11245,
+        866,
+        nickname_version_of=SocialSiteId.YOUTUBE_CHANNEL_ID,
     ),
     SocialSiteId.YOUTUBE_CHANNEL_ID: WikidataInfo(
-        2397, 866, id_version_of=SocialSiteId.YOUTUBE_CHANNEL_HANDLE
+        2397,
+        866,
+        id_version_of=SocialSiteId.YOUTUBE_CHANNEL_HANDLE,
     ),
     SocialSiteId.VIMEO_CHANNEL: WikidataInfo(4015, 156376),
     SocialSiteId.NEWGROUNDS_PAGE: WikidataInfo(None, 263655),
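
Note on the WIKIDATA_PROPERTIES entries above: the reformatted pairs cross-reference each other through id_version_of and nickname_version_of. A quick sanity check of that invariant, using only names that appear in this diff (a sketch, not part of the commit):

    info = WIKIDATA_PROPERTIES[SocialSiteId.YOUTUBE_CHANNEL_HANDLE]
    assert info.nickname_version_of is SocialSiteId.YOUTUBE_CHANNEL_ID
    assert (
        WIKIDATA_PROPERTIES[SocialSiteId.YOUTUBE_CHANNEL_ID].id_version_of
        is SocialSiteId.YOUTUBE_CHANNEL_HANDLE
    )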
@@ -201,12 +206,19 @@ def re_social_path_adv(main_domain: str, *path: str) -> str:
     if main_domain.startswith('www.'):
         msg = f'Redundant www: {main_domain}'
         raise ValueError(msg)
 
-    regex_builder: list[str] = [r'^', r'(?:https?:\/\/)?', r'(?:www\.)?', re.escape(main_domain)]
+    regex_builder: list[str] = [
+        r'^',
+        r'(?:https?:\/\/)?',
+        r'(?:www\.)?',
+        re.escape(main_domain),
+    ]
     for p in path:
         if p != RE_ANY_SUBPATH:
             regex_builder.append(r'\/')
-        regex_builder.append(p if p in {RE_ID, RE_DUAL_ID, RE_ANY_SUBPATH} else re.escape(p))
+        regex_builder.append(
+            p if p in {RE_ID, RE_DUAL_ID, RE_ANY_SUBPATH} else re.escape(p),
+        )
     if path[-1] != RE_ANY_SUBPATH:
         regex_builder.append(r'\/?$')
     return ''.join(regex_builder)
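
Note on re_social_path_adv: it builds one anchored regex per site, with optional scheme and www. prefixes. A minimal sketch of the behavior, assuming re_social_path_adv and RE_ID are importable from socials_util and that RE_ID captures one path segment as group 1 (neither assumption is shown in this hunk):

    import re

    from socials_util import RE_ID, re_social_path_adv

    # Roughly: r'^(?:https?:\/\/)?(?:www\.)?deviantart\.com\/' + RE_ID + r'\/?$'
    pattern = re_social_path_adv('deviantart.com', RE_ID)
    m = re.match(pattern, 'https://www.deviantart.com/some-artist')
    assert m is not None and m.group(1) == 'some-artist'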
@@ -252,10 +264,16 @@ URL_PARSE_YOUTUBE_CHANNEL_ID = re_social_path_adv('youtube.com', 'channel', RE_ID)
 URL_PARSE_VIMEO_CHANNEL = re_social_path_adv('vimeo.com', RE_ID)
 URL_PARSE_NEWGROUNDS_PAGE = re_social_subdomain('newgrounds.com')
 URL_PARSE_ARTSY_ARTIST = re_social_path_adv(
-    'artsy.net', 'artist', RE_ID, RE_ANY_SUBPATH
+    'artsy.net',
+    'artist',
+    RE_ID,
+    RE_ANY_SUBPATH,
 )
 URL_PARSE_ARTNET_ARTIST = re_social_path_adv(
-    'artnet.com', 'artists', RE_ID, RE_ANY_SUBPATH
+    'artnet.com',
+    'artists',
+    RE_ID,
+    RE_ANY_SUBPATH,
 )
 URL_PARSE_DEVIANT_ART_ACCOUNT = re_social_path_adv('deviantart.com', RE_ID)
 URL_PARSE_DEVIANT_ART_ACCOUNT_2 = re_social_subdomain('deviantart.com')
@@ -383,11 +401,13 @@ WELL_KNOWN_MASTODON_INSTANCES: frozenset[str] = frozenset(
         'mastodonapp.uk',
         'fosstodon.org',
         'idlethumbs.social',
-    }
+    },
 )
 
 
-def determine_social_from_url_internally(url: str) -> tuple[SocialSiteId | None, str | None]:
+def determine_social_from_url_internally(
+    url: str,
+) -> tuple[SocialSiteId | None, str | None]:
     assert isinstance(url, str)
 
     # Regexes
@@ -409,6 +429,7 @@ def determine_social_from_url_internally(url: str) -> tuple[SocialSiteId | None, str | None]:
     return (None, None)
 
+
 def to_parse_result(url: str | urllib.parse.ParseResult) -> urllib.parse.ParseResult:
     if isinstance(url, str):
         return urllib.parse.urlparse(url)
@@ -419,14 +440,17 @@ def to_parse_result(url: str | urllib.parse.ParseResult) -> urllib.parse.ParseResult:
     msg = f'Expected {urllib.parse.ParseResult} or {str}'
     raise TypeError(msg)
 
 
-def determine_social_from_url(url_not_normalized: str | urllib.parse.ParseResult) -> SocialLink | None:
+def determine_social_from_url(
+    url_not_normalized: str | urllib.parse.ParseResult,
+) -> SocialLink | None:
     url = to_parse_result(url_not_normalized)
     (social_site_id, social_id) = determine_social_from_url_internally(
-        url._replace(query='', fragment='').geturl()
+        url._replace(query='', fragment='').geturl(),
     )
     if not social_site_id:
         (social_site_id, social_id) = determine_social_from_url_internally(
-            url._replace(fragment='').geturl()
+            url._replace(fragment='').geturl(),
         )
     if not social_site_id:
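
Note on determine_social_from_url: as the rebuilt body shows, it first tries the URL with both query and fragment stripped, then retries with only the fragment stripped. A minimal usage sketch (the exact SocialSiteId returned depends on parsing tables not shown in this hunk; the 'dril' example mirrors the test data at the bottom of this commit):

    import socials_util

    link = socials_util.determine_social_from_url('http://www.twitter.com/dril')
    if link is not None:
        print(link.social_site_id, link.social_id)  # social_id == 'dril'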

View File

@@ -1,6 +1,7 @@
-import socials_util
 import pytest
+
+import socials_util
 
 
 @pytest.mark.parametrize('social_site_id', list(socials_util.SocialSiteId))
 def test_consistency(social_site_id):

View File

@@ -1,6 +1,7 @@
-from socials_util import *
 import aenum
+
+from socials_util import *
 
 
 def test_extension():
     MY_SECRET_SITE = aenum.extend_enum(SocialSiteId, 'MY_SECRET_SITE', 666)

View File

@@ -1,6 +1,6 @@
-from socials_util import *
 import pytest
+from socials_util import *
 
 
 PARSABLE_SOCIAL_IDS = [
     ('http://www.twitter.com/dril', 'dril'),