Skip to content

Commit 42a3bcf

Browse files
committed
Prioritize numbers next to currencies
1 parent 4d9c393 commit 42a3bcf

File tree

2 files changed

+56
-18
lines changed

2 files changed

+56
-18
lines changed

Diff for: price_parser/parser.py

+53-15
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# -*- coding: utf-8 -*-
22
import re
33
import string
4-
from typing import Callable, Optional, Pattern, List, Tuple
4+
from typing import Callable, Match, Optional, Pattern, List, Tuple
55
from decimal import Decimal, InvalidOperation
66

77
import attr
@@ -36,11 +36,11 @@ def fromstring(cls, price: Optional[str],
3636
``price`` string, it could be **preferred** over a value extracted
3737
from ``currency_hint`` string.
3838
"""
39-
amount_text = extract_price_text(price) if price is not None else None
39+
currency, source = _extract_currency_symbol(price, currency_hint)
40+
amount_text = extract_price_text(price, currency if source == price else None) if price is not None else None
4041
amount_num = parse_number(amount_text) if amount_text is not None else None
41-
currency = extract_currency_symbol(price, currency_hint)
4242
if currency is not None:
43-
currency = currency.strip()
43+
currency = currency.group(0).strip()
4444
return Price(
4545
amount=amount_num,
4646
currency=currency,
@@ -120,11 +120,11 @@ def or_regex(symbols: List[str]) -> Pattern:
120120
_search_unsafe_currency = or_regex(OTHER_CURRENCY_SYMBOLS).search
121121

122122

123-
def extract_currency_symbol(price: Optional[str],
124-
currency_hint: Optional[str]) -> Optional[str]:
123+
def _extract_currency_symbol(price: Optional[str], currency_hint: Optional[str]) -> Tuple[Optional[Match], Optional[str]]:
125124
"""
126-
Guess currency symbol from extracted price and currency strings.
127-
Return an empty string if symbol is not found.
125+
Guess the currency symbol from extracted price and currency strings.
126+
Return a (`match object`_, source_string) tuple with the symbol found and
127+
the string where it was found, or (None, None) if no symbol is found.
128128
"""
129129
methods: List[Tuple[Callable, Optional[str]]] = [
130130
(_search_safe_currency, price),
@@ -142,17 +142,32 @@ def extract_currency_symbol(price: Optional[str],
142142
for meth, attr in methods:
143143
m = meth(attr) if attr else None
144144
if m:
145-
return m.group(0)
145+
return m, attr
146+
147+
return None, None
146148

149+
150+
def extract_currency_symbol(price: Optional[str],
151+
currency_hint: Optional[str]) -> Optional[str]:
152+
"""
153+
Guess currency symbol from extracted price and currency strings.
154+
Return the matched symbol as a string, or None if no symbol is found.
155+
"""
156+
match, _ = _extract_currency_symbol(price, currency_hint)
157+
if match:
158+
return match.group(0)
147159
return None
148160

149161

150-
def extract_price_text(price: str) -> Optional[str]:
162+
def extract_price_text(price: str, currency_match: Optional[Match] = None) -> Optional[str]:
151163
"""
152164
Extract text of a price from a string which contains price and
153-
maybe some other text. If multiple price-looking substrings are present,
154-
the first is returned (FIXME: it is better to return a number
155-
which is near a currency symbol).
165+
maybe some other text.
166+
167+
If a match object of the currency within the `price` string is provided,
168+
the amount immediately before the matched currency substring is tried first, then the amount immediately after it.
169+
Otherwise, if multiple price-looking substrings are present, the first is
170+
returned.
156171
157172
>>> extract_price_text("price: $12.99")
158173
'12.99'
@@ -189,16 +204,39 @@ def extract_price_text(price: str) -> Optional[str]:
189204
""", price, re.VERBOSE)
190205
if m:
191206
return m.group(0).replace(' ', '')
207+
208+
def number_from_match(m):
209+
return m.group(1).strip(',.').strip()
210+
211+
if currency_match is not None:
212+
213+
m = re.search(r"""
214+
(\d[\d\s.,]*) # number, probably with thousand separators
215+
\s*$ # only match right before the currency symbol
216+
""", price[:currency_match.start(0)], re.VERBOSE)
217+
if m:
218+
return number_from_match(m)
219+
220+
m = re.search(r"""
221+
^\s* # only match right after the currency symbol
222+
(\d[\d\s.,]*) # number, probably with thousand separators
223+
\s* # skip whitespace
224+
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
225+
""", price[currency_match.end(0):], re.VERBOSE)
226+
if m:
227+
return number_from_match(m)
228+
192229
m = re.search(r"""
193230
(\d[\d\s.,]*) # number, probably with thousand separators
194231
\s* # skip whitespace
195232
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
196233
""", price, re.VERBOSE)
197-
198234
if m:
199-
return m.group(1).strip(',.').strip()
235+
return number_from_match(m)
236+
200237
if 'free' in price.lower():
201238
return '0'
239+
202240
return None
203241

204242

Diff for: tests/test_price_parsing.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -618,7 +618,7 @@ def __eq__(self, other):
618618
Example('€', '€ 139.00',
619619
'€', '139.00', 139),
620620
Example('There are 163 products.', 'From 26 to 50 €',
621-
'€', '26', 26),
621+
'€', '50', 50),
622622
Example('Pris NOK 1 999,00', '139,00',
623623
'NOK', '139,00', 139),
624624
Example('/sqft', '1.52',
@@ -1901,13 +1901,13 @@ def __eq__(self, other):
19011901
'CHF', '19.90', 19.90),
19021902
Example('', '530,42 Zł',
19031903
'Zł', '530,42', 530.42),
1904+
Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR',
1905+
'EUR', '14,85', 14.85),
19041906
]
19051907

19061908

19071909
PRICE_PARSING_EXAMPLES_XFAIL = [
19081910
# amount is picked as a price
1909-
Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR',
1910-
'EUR', '14,85', 14.85),
19111911
Example(None, 'Buy Now - 2 Litre Was $120.00 Now $60.00',
19121912
'$', '60.00', 60),
19131913
Example('Цена: уточняйте (мин. заказ: 1 )', 'Цена: уточняйте (мин. заказ: 1 )',

0 commit comments

Comments
 (0)