1
0

Improved parsing performance

This commit is contained in:
Jon Michael Aanes 2025-05-14 21:01:28 +02:00
parent 066b8cdac7
commit bcefee2dad
2 changed files with 94 additions and 44 deletions

View File

@ -2,7 +2,7 @@ import logging
import re import re
from decimal import Decimal from decimal import Decimal
from .data import DKK, USD, Asset, AssetAmount, FiatCurrency from .data import DKK, USD, Asset, AssetAmount, FiatCurrency, CURRENCY_CODES, CURRENCY_SYMBOLS
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -10,6 +10,54 @@ RE_PRICE_RAW = r'\b(?:dkk|sek|usd|nok|eur)?\s*([1-9][\d.]*[\d](?:,\d+)?)\s*(?:,-
RE_PRICE = re.compile(RE_PRICE_RAW, flags=re.IGNORECASE) RE_PRICE = re.compile(RE_PRICE_RAW, flags=re.IGNORECASE)
# Danish-style amount: optional '.' thousands separators, an optional
# two-digit fraction after ',', and an optional trailing '.'
# (e.g. '1.338,00' or '1338,00.').
RE_PRODUCT_PRICE_DK = r'-?\d{1,3}(?:\.?\d{3})*(?:,\d\d\.?)?'
# English-style amount: optional ',' thousands separators and an optional
# two-digit fraction after '.' (e.g. '1,338.00').
RE_PRODUCT_PRICE_EN = r'-?\d{1,3}(?:,?\d{3})*(?:\.\d\d)?'
# Either style, wrapped in one capturing group so it can be embedded in
# larger patterns.
RE_PRODUCT_PRICE_AMOUNT = r'(' + RE_PRODUCT_PRICE_DK + '|' + RE_PRODUCT_PRICE_EN + ')'

# Compiled once at import time; parse_amount is called for every parsed price.
_RE_PRODUCT_PRICE_DK_COMPILED = re.compile(RE_PRODUCT_PRICE_DK)


def parse_amount(price: str) -> Decimal:
    """Convert a Danish- or English-formatted amount string to a Decimal.

    The text is first tested against the Danish format. On a match, '.' is
    dropped (thousands separator or trailing dot) and ',' becomes the decimal
    point. Otherwise the text is treated as English-formatted and ','
    (thousands separator) is dropped.

    Surrounding whitespace is stripped before the format check. Without that,
    padded Danish input such as ' 13,38 ' would miss the Danish pattern, lose
    its decimal comma in the English branch and silently parse as 1338.

    Args:
        price: The amount text, e.g. '1.338,00', '1,338.00' or '13,38'.

    Returns:
        The parsed amount.

    Raises:
        decimal.InvalidOperation: If the normalised text is not a valid
            decimal number.
    """
    price = price.strip()
    if _RE_PRODUCT_PRICE_DK_COMPILED.fullmatch(price):
        price = price.replace('.', '').replace(',', '.')
    else:
        price = price.replace(',', '')
    return Decimal(price)
# Alternation of every entry yielded by iterating CURRENCY_CODES, each
# regex-escaped, captured in the named group "code".
RE_CURRENCY_CODES = '(?P<code>' + '|'.join(re.escape(c) for c in CURRENCY_CODES) + ')'
# Character class built from the values of CURRENCY_SYMBOLS, captured in the
# named group "sym".
# NOTE(review): a character class matches exactly one character, so this
# presumably assumes every symbol is a single character -- confirm.
RE_CURRENCY_SYMBOLS = (
r'(?P<sym>[' + ''.join(re.escape(c) for c in CURRENCY_SYMBOLS.values()) + '])'
)
# Symbol, optional whitespace, amount, then optionally whitespace and a
# currency code (e.g. '$99' or '$ 99 usd'). The "sym" group comes first, so
# the amount is group 2 in this pattern.
RE_SYM_AMOUNT_CODE = re.compile(RE_CURRENCY_SYMBOLS
+ r'\s*'
+ RE_PRODUCT_PRICE_AMOUNT
+ r'(?:\s+'
+ RE_CURRENCY_CODES
+ r')?', flags=re.IGNORECASE)
# Amount (group 1), optional whitespace, symbol, then optionally whitespace
# and a currency code (e.g. '99$ USD').
RE_AMOUNT_SYM_CODE = re.compile(
RE_PRODUCT_PRICE_AMOUNT
+ r'\s*'
+ RE_CURRENCY_SYMBOLS
+ r'(?:\s+'
+ RE_CURRENCY_CODES
+ ')?'
, flags=re.IGNORECASE)
# Amount (group 1), mandatory whitespace, currency code (e.g. '1338 DKK').
RE_AMOUNT_CODE = re.compile(
RE_PRODUCT_PRICE_AMOUNT + r'\s+' + RE_CURRENCY_CODES
, flags=re.IGNORECASE)
# 'kr' with an optional '.', optional whitespace, then the amount
# (e.g. 'kr. 825,00'). RE_PRODUCT_PRICE_AMOUNT already has its own capturing
# group, so the extra parentheses make groups 1 and 2 capture the same text.
RE_KR_AMOUNT = re.compile(
r'kr\.?\s*(' + RE_PRODUCT_PRICE_AMOUNT + ')',
flags=re.IGNORECASE)
# Amount, optional whitespace, then 'kr' with an optional '.'
# (e.g. '99,00 kr.'). As above, groups 1 and 2 capture the same amount text.
RE_AMOUNT_KR= re.compile(
'(' + RE_PRODUCT_PRICE_AMOUNT + r')\s*kr\.?',
flags=re.IGNORECASE)
def parse_price(text: str, default_currency: Asset) -> AssetAmount | None: def parse_price(text: str, default_currency: Asset) -> AssetAmount | None:
""" """
Attempts to parse price from the given text. Attempts to parse price from the given text.
@ -18,49 +66,24 @@ def parse_price(text: str, default_currency: Asset) -> AssetAmount | None:
""" """
if isinstance(text, AssetAmount): if isinstance(text, AssetAmount):
return text return text
text = str(text) text = str(text).lower().strip()
if m := re.match(r'^Kr\s*([\d.]+(?:,\d+))?$', text): if text == 'free':
return AssetAmount( return AssetAmount(default_currency, Decimal(0))
DKK,
Decimal(m.group(1).replace('.', '').replace(',', '.')),
)
if m := re.match(r'^(\d+)\s*DKK$', text):
return AssetAmount(DKK, Decimal(m.group(1)))
if m := re.match(r'^\$\s*([0-9.]+)(\s+USD)?$', text): code, sym, amount_text = None, None, None
return AssetAmount(USD, Decimal(m.group(1)))
if text.lower().strip() == 'free': if m := RE_SYM_AMOUNT_CODE.fullmatch(text):
return AssetAmount(default_currency, Decimal(0.0)) code, sym, amount_text = m.group('code'), m.group('sym'), m.group(2)
elif m := RE_AMOUNT_SYM_CODE.fullmatch(text):
text = str(text).strip().lower().removesuffix('.') code, sym, amount_text = m.group('code'), m.group('sym'), m.group(1)
if m := RE_PRICE.fullmatch(text): elif m := RE_AMOUNT_CODE.fullmatch(text):
currency = default_currency code, amount_text = m.group('code'), m.group(1)
price_tag = m.group(1).replace('.', '').replace(',', '.') # TODO elif m := (RE_KR_AMOUNT.fullmatch(text) or RE_AMOUNT_KR.fullmatch(text)):
if text.endswith('dkk') or text.startswith('dkk'): code, amount_text = 'DKK', m.group(1)
currency = FiatCurrency('DKK')
elif text.endswith('sek') or text.startswith('sek'):
currency = FiatCurrency('SEK')
elif text.endswith('nok') or text.startswith('nok'):
currency = FiatCurrency('NOK')
elif text.endswith('usd') or text.startswith('usd'):
currency = FiatCurrency('USD')
return AssetAmount(currency, Decimal(price_tag))
logger.warning('Unknown price format: %s', text)
return None
def parse_usd_price(s: str | int) -> AssetAmount:
assert s is not None
if isinstance(s, str) or isinstance(s, int):
text = str(s)
else: else:
text = s.text_content() return None
text = text.strip().replace(',', '').removeprefix('$')
if text in {'-', ''}: currency = CURRENCY_CODES[code.upper()] if code else FiatCurrency.from_currency_symbol(sym)
return AssetAmount(USD, Decimal(0)) # TODO assert currency is not None
dollar_amount = Decimal(text) return AssetAmount(currency, parse_amount(amount_text))
return AssetAmount(USD, dollar_amount)

View File

@ -28,9 +28,36 @@ PRICES_UNPARSABLE = [
@pytest.mark.parametrize(('price_string', 'parsed_amount'), PRICES_PARSABLE) @pytest.mark.parametrize(('price_string', 'parsed_amount'), PRICES_PARSABLE)
def test_parse_price(price_string: str, parsed_amount: AssetAmount): def test_parse_price(price_string: str, parsed_amount: AssetAmount):
result = parse_price(price_string, parsed_amount.asset) result = parse_price(price_string, FiatCurrency.JPY)
assert result == parsed_amount assert result == parsed_amount
@pytest.mark.parametrize('price_string', PRICES_UNPARSABLE) @pytest.mark.parametrize('price_string', PRICES_UNPARSABLE)
def test_parse_unparsable(price_string: str): def test_parse_unparsable(price_string: str):
assert parse_price(price_string, USD) is None assert parse_price(price_string, USD) is None
def parse_asset_amount(text: str) -> AssetAmount:
    """Parse *text* via parse_price, failing with a clear message if unparsable.

    FiatCurrency.JPY is passed as the default currency.
    NOTE(review): presumably chosen because no test input is in JPY, so an
    accidental fallback to the default would show up as a wrong asset -- confirm.

    parse_price may return None. Asserting here keeps the declared return
    type honest and names the offending input, instead of letting callers
    fail with an AttributeError on `.amount` / `.asset`.
    """
    result = parse_price(text, FiatCurrency.JPY)
    assert result is not None, f'Could not parse price: {text!r}'
    return result
def test_parse_asset_amount_dkk():
    """Check amounts (and, for 'kr' inputs, the DKK asset) of Danish prices."""
    # (price text, expected amount, whether the asset is also checked)
    cases = [
        ('1338 DKK', 1338, False),
        ('1338,00 DKK', 1338, False),
        ('13,38 DKK', Decimal('13.38'), False),
        ('13.38 DKK', Decimal('13.38'), False),
        ('1338,00. DKK', 1338, False),
        ('99,00 kr.', 99, False),
        ('kr 825.00', 825, True),
        ('kr 825,00', 825, True),
        ('kr. 825.00', 825, True),
    ]
    for price_text, expected_amount, check_asset in cases:
        assert parse_asset_amount(price_text).amount == expected_amount
        if check_asset:
            assert parse_asset_amount(price_text).asset == DKK
def test_parse_asset_amount_usd():
    """Check that dollar-sign prices parse to 99 USD, with or without a code."""
    for price_text in ('$99', '99$ USD'):
        assert parse_asset_amount(price_text).amount == 99
        assert parse_asset_amount(price_text).asset == USD