Skip to content

Commit bd856ed

Browse files
committed
Prioritize numbers next to currencies
1 parent 4d9c393 commit bd856ed

File tree

2 files changed

+112
-22
lines changed

2 files changed

+112
-22
lines changed

price_parser/parser.py

+60-16
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# -*- coding: utf-8 -*-
22
import re
33
import string
4-
from typing import Callable, Optional, Pattern, List, Tuple
4+
from typing import Callable, Match, Optional, Pattern, List, Tuple
55
from decimal import Decimal, InvalidOperation
66

77
import attr
@@ -36,11 +36,17 @@ def fromstring(cls, price: Optional[str],
3636
``price`` string, it could be **preferred** over a value extracted
3737
from ``currency_hint`` string.
3838
"""
39-
amount_text = extract_price_text(price) if price is not None else None
39+
currency_match, source = _extract_currency_symbol(price, currency_hint)
40+
if price is not None:
41+
_currency_match = currency_match if source == price else None
42+
amount_text = extract_price_text(price, _currency_match)
43+
else:
44+
amount_text = None
4045
amount_num = parse_number(amount_text) if amount_text is not None else None
41-
currency = extract_currency_symbol(price, currency_hint)
42-
if currency is not None:
43-
currency = currency.strip()
46+
if currency_match is not None:
47+
currency = currency_match.group(0).strip()
48+
else:
49+
currency = None
4450
return Price(
4551
amount=amount_num,
4652
currency=currency,
@@ -120,11 +126,11 @@ def or_regex(symbols: List[str]) -> Pattern:
120126
_search_unsafe_currency = or_regex(OTHER_CURRENCY_SYMBOLS).search
121127

122128

123-
def extract_currency_symbol(price: Optional[str],
124-
currency_hint: Optional[str]) -> Optional[str]:
129+
def _extract_currency_symbol(price: Optional[str], currency_hint: Optional[str]) -> Tuple[Optional[Match], Optional[str]]:
125130
"""
126-
Guess currency symbol from extracted price and currency strings.
127-
Return an empty string if symbol is not found.
131+
Guess the currency symbol from extracted price and currency strings.
132+
Return a (`match object`_, source_string) tuple with the symbol found and
133+
the string where it was found, or (None, None) if no symbol is found.
128134
"""
129135
methods: List[Tuple[Callable, Optional[str]]] = [
130136
(_search_safe_currency, price),
@@ -142,17 +148,32 @@ def extract_currency_symbol(price: Optional[str],
142148
for meth, attr in methods:
143149
m = meth(attr) if attr else None
144150
if m:
145-
return m.group(0)
151+
return m, attr
152+
153+
return None, None
146154

155+
156+
def extract_currency_symbol(price: Optional[str],
157+
currency_hint: Optional[str]) -> Optional[str]:
158+
"""
159+
Guess currency symbol from extracted price and currency strings.
160+
Return the symbol as a string, exactly as it was matched, or None if no symbol is found.
161+
"""
162+
match, _ = _extract_currency_symbol(price, currency_hint)
163+
if match:
164+
return match.group(0)
147165
return None
148166

149167

150-
def extract_price_text(price: str) -> Optional[str]:
168+
def extract_price_text(price: str, currency_match: Optional[Match] = None) -> Optional[str]:
151169
"""
152170
Extract text of a price from a string which contains price and
153-
maybe some other text. If multiple price-looking substrings are present,
154-
the first is returned (FIXME: it is better to return a number
155-
which is near a currency symbol).
171+
maybe some other text.
172+
173+
If a match object of the currency within the `price` string is provided,
174+
the amount right before the matched currency substring is tried first, then the one right after it.
175+
Otherwise, if multiple price-looking substrings are present, the first is
176+
returned.
156177
157178
>>> extract_price_text("price: $12.99")
158179
'12.99'
@@ -189,16 +210,39 @@ def extract_price_text(price: str) -> Optional[str]:
189210
""", price, re.VERBOSE)
190211
if m:
191212
return m.group(0).replace(' ', '')
213+
214+
def number_from_match(m):
215+
return m.group(1).strip(',.').strip()
216+
217+
if currency_match is not None:
218+
219+
m = re.search(r"""
220+
(\d[\d\s.,]*) # number, probably with thousand separators
221+
\s*$ # only match right before the currency symbol
222+
""", price[:currency_match.start(0)], re.VERBOSE)
223+
if m:
224+
return number_from_match(m)
225+
226+
m = re.search(r"""
227+
^\s* # only match right after the currency symbol
228+
(\d[\d\s.,]*) # number, probably with thousand separators
229+
\s* # skip whitespace
230+
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
231+
""", price[currency_match.end(0):], re.VERBOSE)
232+
if m:
233+
return number_from_match(m)
234+
192235
m = re.search(r"""
193236
(\d[\d\s.,]*) # number, probably with thousand separators
194237
\s* # skip whitespace
195238
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
196239
""", price, re.VERBOSE)
197-
198240
if m:
199-
return m.group(1).strip(',.').strip()
241+
return number_from_match(m)
242+
200243
if 'free' in price.lower():
201244
return '0'
245+
202246
return None
203247

204248

tests/test_price_parsing.py

+52-6
Original file line numberDiff line numberDiff line change
@@ -617,8 +617,6 @@ def __eq__(self, other):
617617
'Р', '30', 30),
618618
Example('€', '€ 139.00',
619619
'€', '139.00', 139),
620-
Example('There are 163 products.', 'From 26 to 50 €',
621-
'€', '26', 26),
622620
Example('Pris NOK 1 999,00', '139,00',
623621
'NOK', '139,00', 139),
624622
Example('/sqft', '1.52',
@@ -1901,15 +1899,55 @@ def __eq__(self, other):
19011899
'CHF', '19.90', 19.90),
19021900
Example('', '530,42 Zł',
19031901
'Zł', '530,42', 530.42),
1902+
1903+
# Prefer values next to currency symbols
1904+
Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR',
1905+
'EUR', '14,85', 14.85),
1906+
Example(None, '2 items at 24,00€',
1907+
'€', '24,00', 24.00),
1908+
Example(None, '2 items at 24,00 €',
1909+
'€', '24,00', 24.00),
1910+
Example(None, '2 items at €24,00',
1911+
'€', '24,00', 24.00),
1912+
Example(None, '2 items at € 24,00',
1913+
'€', '24,00', 24.00),
1914+
Example(None, '2 items at 24,00€ or 30,00€',
1915+
'€', '24,00', 24.00),
1916+
Example(None, '2 items at 24,00€ or 30,00 €',
1917+
'€', '24,00', 24.00),
1918+
Example(None, '2 items at 24,00€ or €30,00',
1919+
'€', '24,00', 24.00),
1920+
Example(None, '2 items at 24,00€ or € 30,00',
1921+
'€', '24,00', 24.00),
1922+
Example(None, '2 items at 24,00 € or 30,00€',
1923+
'€', '24,00', 24.00),
1924+
Example(None, '2 items at 24,00 € or 30,00 €',
1925+
'€', '24,00', 24.00),
1926+
Example(None, '2 items at 24,00 € or €30,00',
1927+
'€', '24,00', 24.00),
1928+
Example(None, '2 items at 24,00 € or € 30,00',
1929+
'€', '24,00', 24.00),
1930+
Example(None, '2 items at €24,00 or 30,00€',
1931+
'€', '24,00', 24.00),
1932+
Example(None, '2 items at €24,00 or 30,00 €',
1933+
'€', '24,00', 24.00),
1934+
Example(None, '2 items at €24,00 or €30,00',
1935+
'€', '24,00', 24.00),
1936+
Example(None, '2 items at €24,00 or € 30,00',
1937+
'€', '24,00', 24.00),
1938+
Example(None, '2 items at € 24,00 or 30,00€',
1939+
'€', '24,00', 24.00),
1940+
Example(None, '2 items at € 24,00 or 30,00 €',
1941+
'€', '24,00', 24.00),
1942+
Example(None, '2 items at € 24,00 or €30,00',
1943+
'€', '24,00', 24.00),
1944+
Example(None, '2 items at € 24,00 or € 30,00',
1945+
'€', '24,00', 24.00),
19041946
]
19051947

19061948

19071949
PRICE_PARSING_EXAMPLES_XFAIL = [
19081950
# amount is picked as a price
1909-
Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR',
1910-
'EUR', '14,85', 14.85),
1911-
Example(None, 'Buy Now - 2 Litre Was $120.00 Now $60.00',
1912-
'$', '60.00', 60),
19131951
Example('Цена: уточняйте (мин. заказ: 1 )', 'Цена: уточняйте (мин. заказ: 1 )',
19141952
None, None, None),
19151953
Example(None, '50 - $2.00 100 - $2.75 400 - $4.50 1,000 - $9.00 2,000 - $17.00 3,000 - $24.00 10,000 - $75.00',
@@ -1923,6 +1961,14 @@ def __eq__(self, other):
19231961
Example('Cuneo', '61.858 L', # Romanian New Leu
19241962
'L', '61.858', 61858),
19251963

1964+
# no handling of price ranges
1965+
Example('There are 163 products.', 'From 26 to 50 €',
1966+
'€', '26', 26),
1967+
1968+
# no handling of old-vs-new prices
1969+
Example(None, 'Buy Now - 2 Litre Was $120.00 Now $60.00',
1970+
'$', '60.00', 60),
1971+
19261972
# "р" / "руб" is detected as currency
19271973
Example('>', 'См. цену в прайсе',
19281974
None, None, None),

0 commit comments

Comments
 (0)