Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support numbers as words in English #11

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
161 changes: 142 additions & 19 deletions price_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,52 @@ def extract_currency_symbol(price: Optional[str],
return None


# word: (scale, increment)
_NUMBER_WORDS = {
word: (Decimal(scale), Decimal(increment)) for word, scale, increment in
(
('zero', 1, 0),
('one', 1, 1),
('two', 1, 2),
('three', 1, 3),
('four', 1, 4),
('five', 1, 5),
('six', 1, 6),
('seven', 1, 7),
('eight', 1, 8),
('nine', 1, 9),
('ten', 1, 10),
('eleven', 1, 11),
('twelve', 1, 12),
('thirteen', 1, 13),
('fourteen', 1, 14),
('fifteen', 1, 15),
('sixteen', 1, 16),
('seventeen', 1, 17),
('eighteen', 1, 18),
('nineteen', 1, 19),
('twenty', 1, 20),

('thirty', 1, 30),
('forty', 1, 40),
('fifty', 1, 50),
('sixty', 1, 60),
('seventy', 1, 70),
('eighty', 1, 80),
('ninety', 1, 90),

('hundred', 100, 0),
('thousand', 10 ** 3, 0),
('million', 10 ** 6, 0),
('billion', 10 ** 9, 0),
('trillion', 10 ** 12, 0),

('and', 1, 0),
)
}
_NUMBER_WORDS_PATTERN = r'(?:{})'.format('|'.join(_NUMBER_WORDS))


def extract_price_text(price: str) -> Optional[str]:
"""
Extract text of a price from a string which contains price and
Expand All @@ -179,11 +225,11 @@ def extract_price_text(price: str) -> Optional[str]:
>>> extract_price_text("99 € 79 €")
'99'
>>> extract_price_text("35€ 99")
'35€99'
'35€ 99'
>>> extract_price_text("35€ 999")
'35'
>>> extract_price_text("1,235€ 99")
'1,235€99'
'1,235€ 99'
>>> extract_price_text("50% OFF")
>>> extract_price_text("50%")
>>> extract_price_text("50")
Expand All @@ -192,24 +238,45 @@ def extract_price_text(price: str) -> Optional[str]:
'1 298,00'
>>> extract_price_text("$.75")
'.75'
>>> extract_price_text("$ 4 million")
'4 million'
>>> extract_price_text("four million")
'four million'
>>> extract_price_text("1 thousand 35€ 99")
'1 thousand 35€ 99'
"""
price = re.sub(r'\s+', ' ', price) # clean initial text from non-breaking and extra spaces

m = None
if price.count('€') == 1:
m = re.search(r"""
[\d\s.,]*?\d # number, probably with thousand separators
\s*?€(\s*?)? # euro, probably separated by whitespace
\d(?(1)\d|\d*?) # if separated by whitespace - search one digit, multiple digits otherwise
(?:$|[^\d]) # something which is not a digit
""", price, re.VERBOSE)
if m:
return m.group(0).replace(' ', '')
(
(?:
[\d\s.,]| # number, probably with thousand separators
{} # numeric English words
)*?
\d # there must be a a digit before €
\s*?€(\s*?)? # euro, probably separated by whitespace
\d(?(2)\d|\d*?) # if separated by whitespace - search one digit, multiple digits otherwise
)
(?:$|[^\d]) # something which is not a digit
""".format(_NUMBER_WORDS_PATTERN), price, re.VERBOSE)

m = re.search(r"""
([.]?\d[\d\s.,]*) # number, probably with thousand separators
\s*? # skip whitespace
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
""", price, re.VERBOSE)
if not m:
m = re.search(r"""
(
(?:
\d| # number, as a digit
{0} # numeric English words
)
(?:
[\d\s.,]| # number, probably with thousand separators
{0} # numeric English words
)*
)
\s*? # skip whitespace
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
""".format(_NUMBER_WORDS_PATTERN), price, re.VERBOSE)

if m:
price_text = m.group(1).rstrip(',.')
Expand All @@ -218,8 +285,10 @@ def extract_price_text(price: str) -> Optional[str]:
if price_text.count('.') == 1
else price_text.lstrip(',.').strip()
)

if 'free' in price.lower():
return '0'

return None


Expand Down Expand Up @@ -257,6 +326,56 @@ def get_decimal_separator(price: str) -> Optional[str]:
return m.group(1)


# Based on https://stackoverflow.com/a/493788/939364
def _words_to_digits(words):
"""Given a string containing a number in English, or a combination of
English and digits (e.g. ‘3 million’), return the corresponding number as a
``Decimal``.

In the input string, thousand separators must not exist, and if there are
decimals, a dot (.) must be used as decimal separator, and there must be
digits at either side of the decimal separator.

``None`` is return on invalid input.

>>> parse_number("1234")
Decimal('1234')
>>> parse_number("12.34")
Decimal('12.34')
>>> parse_number("4 million")
Decimal('4000000')
>>> parse_number("four million")
Decimal('4000000')
>>> parse_number("")
>>> parse_number(" ")
>>> parse_number("foo")
"""
if not words.strip():
return None

current = result = Decimal(0)
for word in words.split():
try:
scale, increment = _NUMBER_WORDS[word]
is_word = True
except KeyError:
try:
increment = Decimal(word)
except InvalidOperation:
return None
match = re.match(r'\d+', word)
assert match is not None
scale = Decimal(10) ** len(match[0])
is_word = False

current = current * scale + increment
if scale > Decimal(100) and is_word:
result += current
current = Decimal(0)

return result + current


def parse_number(num: str,
decimal_separator: Optional[str] = None) -> Optional[Decimal]:
""" Parse a string with a number to a Decimal, guessing its format:
Expand Down Expand Up @@ -290,12 +409,18 @@ def parse_number(num: str,
Decimal('140000')
>>> parse_number("140.000", decimal_separator=".")
Decimal('140.000')
>>> parse_number("4 million")
Decimal('4000000')
>>> parse_number("four million")
Decimal('4000000')
>>> parse_number("")
>>> parse_number("foo")
"""
if not num:
return None
num = num.strip().replace(' ', '')

num = num.strip()
num = re.sub(r'\s+(?=\W)|(?<=\W)\s+', '', num)
decimal_separator = decimal_separator or get_decimal_separator(num)
# NOTE: Keep supported separators in sync with _search_decimal_sep
if decimal_separator is None:
Expand All @@ -307,7 +432,5 @@ def parse_number(num: str,
else:
assert decimal_separator == '€'
num = num.replace('.', '').replace(',', '').replace('€', '.')
try:
return Decimal(num)
except InvalidOperation:
return None

return _words_to_digits(num)
20 changes: 20 additions & 0 deletions tests/test_natural_language_numbers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from decimal import Decimal

from pytest import mark

from price_parser import parse_price


@mark.parametrize(
'input_text,amount_text,amount',
[
('$ 4 million', '4 million', 4000000),
('$ four million', 'four million', 4000000),
('$ four million dollars', 'four million', 4000000),
('$ 400 thousand', '400 thousand', 400000),
('$ 1 thousand 999 € 99', '1 thousand 999 € 99', '1999.99'),
],
)
def test_price_parsing(input_text, amount_text, amount):
price = parse_price(input_text)
assert (price.amount_text, price.amount) == (amount_text, Decimal(amount))
4 changes: 2 additions & 2 deletions tests/test_price_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,7 +696,7 @@ def __eq__(self, other):
Example('USD', '349.95',
'USD', '349.95', 349.95),
Example("35€ 99 dont 0,02 € d'éco-part", '35€ 99',
'€', '35€99', 35.99),
'€', '35€ 99', 35.99),
Example('5,72 €', '5,72 €',
'€', '5,72', 5.72),
Example('199,00 грн.', '199,00 грн.',
Expand Down Expand Up @@ -1990,7 +1990,7 @@ def __eq__(self, other):
Example(None, '1250€ 600',
'€', '1250', 1250, "€"),
Example(None, '1250€ 60',
'€', '1250€60', 1250.60, "€"),
'€', '1250€ 60', 1250.60, "€"),
Example(None, '1250€600',
'€', '1250€600', 1250.600, "€"),
Example(None, '.75 €',
Expand Down