Jon Michael Aanes 2024-05-12 16:35:17 +02:00
parent 1aa41a8414
commit cc4b666011
Signed by: Jmaa
SSH Key Fingerprint: SHA256:Ab0GfHGCblESJx7JRE4fj4bFy/KRpeLhi41y4pF3sNA
4 changed files with 59 additions and 33 deletions


@@ -1,22 +1,19 @@
-"""
-Small utility for detecting social websites.
-"""
+"""Small utility for detecting social websites."""
-from dataclasses import dataclass
-from enforce_typing import enforce_types
-from typing import List, Set, Optional, Union
-import aenum
 import datetime
 import re
 import urllib.parse
+from dataclasses import dataclass
+from typing import List, Optional, Set, Union
+import aenum
+from enforce_typing import enforce_types
 
 from socials_util._version import __version__
 
 
 class SocialSiteId(aenum.Enum):
-    """
-    The great social website enum.
-    """
+    """The great social website enum."""
 
     # Reddit-like
     REDDIT = 1  # Should have been named REDDIT_SUBREDDIT
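A side note on the enum base: SocialSiteId derives from aenum.Enum rather than the standard library's enum.Enum, which is what allows the extension test near the end of this commit to register new members at runtime. A minimal sketch of that mechanism, using a made-up enum and member:

    import aenum

    class Color(aenum.Enum):
        RED = 1

    # aenum allows adding members after class creation; stdlib enum.Enum forbids it.
    aenum.extend_enum(Color, 'BLUE', 2)
    assert Color.BLUE.value == 2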
@@ -103,19 +100,19 @@ AGGERAGOR_SOCIALS = {
 @enforce_types
 @dataclass(frozen=True)
-class SocialLink(object):
+class SocialLink:
     url: urllib.parse.ParseResult
     social_site_id: SocialSiteId
-    social_id: Optional[str]
+    social_id: str | None
 
 
 @enforce_types
 @dataclass(frozen=True)
-class WikidataInfo(object):
-    property_id: Optional[int]
-    issuer_id: Optional[int]
-    id_version_of: Optional[SocialSiteId] = None
-    nickname_version_of: Optional[SocialSiteId] = None
+class WikidataInfo:
+    property_id: int | None
+    issuer_id: int | None
+    id_version_of: SocialSiteId | None = None
+    nickname_version_of: SocialSiteId | None = None
 
 
 WIKIDATA_PROPERTIES: dict[SocialSiteId, WikidataInfo] = {
@@ -134,10 +131,14 @@ WIKIDATA_PROPERTIES: dict[SocialSiteId, WikidataInfo] = {
     SocialSiteId.TUMBLR: WikidataInfo(3943, None),
     SocialSiteId.TIKTOK_USER: WikidataInfo(7085, None),
     SocialSiteId.PIXIV_USER_ID: WikidataInfo(
-        5435, 306956, id_version_of=SocialSiteId.PIXIV_USER_NICKNAME
+        5435,
+        306956,
+        id_version_of=SocialSiteId.PIXIV_USER_NICKNAME,
     ),
     SocialSiteId.PIXIV_USER_NICKNAME: WikidataInfo(
-        None, 306956, nickname_version_of=SocialSiteId.PIXIV_USER_ID
+        None,
+        306956,
+        nickname_version_of=SocialSiteId.PIXIV_USER_ID,
     ),
     SocialSiteId.MASTODON_PAGE: WikidataInfo(4033, None),
     SocialSiteId.PATREON_PAGE: WikidataInfo(4175, 15861362),
@@ -145,10 +146,14 @@ WIKIDATA_PROPERTIES: dict[SocialSiteId, WikidataInfo] = {
     # SocialSiteId.INPRNT_PAGE: WikidataInfo(None, None),
     SocialSiteId.CARRD_PAGE: WikidataInfo(None, 106036503),
     SocialSiteId.YOUTUBE_CHANNEL_HANDLE: WikidataInfo(
-        11245, 866, nickname_version_of=SocialSiteId.YOUTUBE_CHANNEL_ID
+        11245,
+        866,
+        nickname_version_of=SocialSiteId.YOUTUBE_CHANNEL_ID,
     ),
     SocialSiteId.YOUTUBE_CHANNEL_ID: WikidataInfo(
-        2397, 866, id_version_of=SocialSiteId.YOUTUBE_CHANNEL_HANDLE
+        2397,
+        866,
+        id_version_of=SocialSiteId.YOUTUBE_CHANNEL_HANDLE,
     ),
     SocialSiteId.VIMEO_CHANNEL: WikidataInfo(4015, 156376),
     SocialSiteId.NEWGROUNDS_PAGE: WikidataInfo(None, 263655),
@@ -201,12 +206,19 @@ def re_social_path_adv(main_domain: str, *path: str) -> str:
     if main_domain.startswith('www.'):
         msg = f'Redundant www: {main_domain}'
         raise ValueError(msg)
-    regex_builder: list[str] = [r'^', r'(?:https?:\/\/)?', r'(?:www\.)?', re.escape(main_domain)]
+    regex_builder: list[str] = [
+        r'^',
+        r'(?:https?:\/\/)?',
+        r'(?:www\.)?',
+        re.escape(main_domain),
+    ]
     for p in path:
         if p != RE_ANY_SUBPATH:
             regex_builder.append(r'\/')
-        regex_builder.append(p if p in {RE_ID, RE_DUAL_ID, RE_ANY_SUBPATH} else re.escape(p))
+        regex_builder.append(
+            p if p in {RE_ID, RE_DUAL_ID, RE_ANY_SUBPATH} else re.escape(p),
+        )
     if path[-1] != RE_ANY_SUBPATH:
         regex_builder.append(r'\/?$')
     return ''.join(regex_builder)
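For orientation, here is roughly what the builder above produces. RE_ID is defined outside this hunk, so the capture group below is an assumption, not the library's actual pattern:

    # Assumes RE_ID is a capture group along the lines of r'([^/]+)'.
    re_social_path_adv('deviantart.com', RE_ID)
    # -> r'^(?:https?:\/\/)?(?:www\.)?deviantart\.com\/([^/]+)\/?$'
    # When a path segment is RE_ANY_SUBPATH, the r'\/' separator before it is
    # skipped, and a trailing RE_ANY_SUBPATH also suppresses the r'\/?$' anchor,
    # as in the artsy.net and artnet.com patterns in the next hunk.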
@@ -252,10 +264,16 @@ URL_PARSE_YOUTUBE_CHANNEL_ID = re_social_path_adv('youtube.com', 'channel', RE_ID)
 URL_PARSE_VIMEO_CHANNEL = re_social_path_adv('vimeo.com', RE_ID)
 URL_PARSE_NEWGROUNDS_PAGE = re_social_subdomain('newgrounds.com')
 URL_PARSE_ARTSY_ARTIST = re_social_path_adv(
-    'artsy.net', 'artist', RE_ID, RE_ANY_SUBPATH
+    'artsy.net',
+    'artist',
+    RE_ID,
+    RE_ANY_SUBPATH,
 )
 URL_PARSE_ARTNET_ARTIST = re_social_path_adv(
-    'artnet.com', 'artists', RE_ID, RE_ANY_SUBPATH
+    'artnet.com',
+    'artists',
+    RE_ID,
+    RE_ANY_SUBPATH,
 )
 URL_PARSE_DEVIANT_ART_ACCOUNT = re_social_path_adv('deviantart.com', RE_ID)
 URL_PARSE_DEVIANT_ART_ACCOUNT_2 = re_social_subdomain('deviantart.com')
@@ -383,11 +401,13 @@ WELL_KNOWN_MASTODON_INSTANCES: frozenset[str] = frozenset(
         'mastodonapp.uk',
         'fosstodon.org',
         'idlethumbs.social',
-    }
+    },
 )
 
 
-def determine_social_from_url_internally(url: str) -> tuple[SocialSiteId | None, str | None]:
+def determine_social_from_url_internally(
+    url: str,
+) -> tuple[SocialSiteId | None, str | None]:
     assert isinstance(url, str)
 
     # Regexes
@@ -409,6 +429,7 @@ def determine_social_from_url_internally(url: str) -> tuple[SocialSiteId | None, str | None]:
     return (None, None)
 
+
 def to_parse_result(url: str | urllib.parse.ParseResult) -> urllib.parse.ParseResult:
     if isinstance(url, str):
         return urllib.parse.urlparse(url)
@@ -419,14 +440,17 @@ def to_parse_result(url: str | urllib.parse.ParseResult) -> urllib.parse.ParseResult:
     msg = f'Expected {urllib.parse.ParseResult} or {str}'
     raise TypeError(msg)
 
 
-def determine_social_from_url(url_not_normalized: str | urllib.parse.ParseResult) -> SocialLink | None:
+def determine_social_from_url(
+    url_not_normalized: str | urllib.parse.ParseResult,
+) -> SocialLink | None:
     url = to_parse_result(url_not_normalized)
     (social_site_id, social_id) = determine_social_from_url_internally(
-        url._replace(query='', fragment='').geturl()
+        url._replace(query='', fragment='').geturl(),
     )
     if not social_site_id:
         (social_site_id, social_id) = determine_social_from_url_internally(
-            url._replace(fragment='').geturl()
+            url._replace(fragment='').geturl(),
         )
     if not social_site_id:
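The reflowed signature leaves the calling convention unchanged. A minimal usage sketch; the vimeo handle here is invented for illustration:

    import socials_util

    link = socials_util.determine_social_from_url('https://vimeo.com/somechannel')
    if link is not None:
        # Matches URL_PARSE_VIMEO_CHANNEL above; social_id is the captured handle.
        assert link.social_site_id is socials_util.SocialSiteId.VIMEO_CHANNEL
        assert link.social_id == 'somechannel'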


@@ -1,6 +1,7 @@
-import socials_util
 import pytest
+
+import socials_util
 
 
 @pytest.mark.parametrize('social_site_id', list(socials_util.SocialSiteId))
 def test_consistency(social_site_id):


@@ -1,6 +1,7 @@
-from socials_util import *
 import aenum
+
+from socials_util import *
 
 
 def test_extension():
     MY_SECRET_SITE = aenum.extend_enum(SocialSiteId, 'MY_SECRET_SITE', 666)


@@ -1,6 +1,6 @@
-from socials_util import *
 import pytest
+from socials_util import *
 
 
 PARSABLE_SOCIAL_IDS = [
     ('http://www.twitter.com/dril', 'dril'),
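The hunk cuts off before the test that consumes this table. Presumably it is a parametrized test shaped roughly like the sketch below; the test name and body are guesses based on the (url, expected_id) pairs above:

    @pytest.mark.parametrize(('url', 'expected_id'), PARSABLE_SOCIAL_IDS)
    def test_parsable_social_ids(url, expected_id):
        link = determine_social_from_url(url)
        assert link is not None
        assert link.social_id == expected_id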