Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not parse dates as prices. Sort imports. #19

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
47 changes: 42 additions & 5 deletions price_parser/parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# -*- coding: utf-8 -*-

import re
import string
from typing import Callable, Optional, Pattern, List, Tuple
from datetime import datetime
from decimal import Decimal, InvalidOperation
from typing import Callable, List, Optional, Pattern, Tuple

import attr

from ._currencies import (CURRENCY_CODES, CURRENCY_NATIONAL_SYMBOLS,
CURRENCY_SYMBOLS)

Expand Down Expand Up @@ -69,7 +72,7 @@ def or_regex(symbols: List[str]) -> Pattern:

# unique currency symbols
'$', '€', '£', 'zł', 'Zł', 'Kč', '₽', '¥', '¥',
'฿', 'դր.', 'դր', '₦', '₴', '₱', '৳', '₭', '₪', '﷼', '៛', '₩', '₫', '₡',
'฿', 'դր.', 'դր', '₦', '₴', '₱', '৳', '₭', '₪', '﷼', '៛', '₩', '₫', '₡',
'টকা', 'ƒ', '₲', '؋', '₮', 'नेरू', '₨',
'₶', '₾', '֏', 'ރ', '৲', '૱', '௹', '₠', '₢', '₣', '₤', '₧', '₯',
'₰', '₳', '₷', '₸', '₹', '₺', '₼', '₾', '₿', 'ℳ',
Expand All @@ -82,7 +85,7 @@ def or_regex(symbols: List[str]) -> Pattern:

# other common symbols, which we consider unambiguous
'EUR', 'euro', 'eur', 'CHF', 'DKK', 'Rp', 'lei',
'руб.', 'руб', 'грн.', 'грн', 'дин.', 'Dinara', 'динар', 'лв.', 'лв',
'руб.', 'руб', 'грн.', 'грн', 'дин.', 'Dinara', 'динар', 'лв.', 'лв',
'р.', 'тңг', 'тңг.', 'ман.',
]

Expand Down Expand Up @@ -139,8 +142,8 @@ def extract_currency_symbol(price: Optional[str],
if price and '$' in price:
methods.insert(0, (_search_dollar_code, price))

for meth, attr in methods:
m = meth(attr) if attr else None
for meth, attrib in methods:
m = meth(attrib) if attrib else None
if m:
return m.group(0)

Expand Down Expand Up @@ -180,6 +183,12 @@ def extract_price_text(price: str) -> Optional[str]:
>>> extract_price_text("50")
'50'
"""

if date_format(price):
bulatbulat48 marked this conversation as resolved.
Show resolved Hide resolved
return None

price = strip_date(price)

if price.count('€') == 1:
m = re.search(r"""
[\d\s.,]*?\d # number, probably with thousand separators
Expand Down Expand Up @@ -283,3 +292,31 @@ def parse_number(num: str) -> Optional[Decimal]:
return Decimal(num)
except InvalidOperation:
return None


def date_format(price: object) -> Optional[datetime]:
    """Return ``price`` parsed as a date when the whole value is a date.

    The value is tried against each supported layout in turn:
    ``%d.%m.%Y`` (``15.08.2017``), ``%B, %Y`` (``July, 2004``),
    ``%b, %Y`` (``Jul, 2004``) and ``%Y-%m-%d`` (``2019-08-19``).

    :param price: candidate text; non-string values are never dates.
    :return: the parsed ``datetime`` for the first layout that matches,
        or ``None`` when no layout matches.
    """
    if not isinstance(price, str):
        return None

    # strptime rejects leading/trailing whitespace for these layouts, so a
    # padded date such as ' 15.08.2017 ' would otherwise be treated as a price.
    candidate = price.strip()
    for fmt in ('%d.%m.%Y', '%B, %Y', '%b, %Y', '%Y-%m-%d'):
        try:
            return datetime.strptime(candidate, fmt)
        except ValueError:
            continue

    return None


def strip_date(text: str) -> str:
    """Remove date-like fragments from ``text`` so they are not read as prices.

    Runs of whitespace are first collapsed to a single space. Two shapes of
    fragment are then looked for: numeric ``N-N-N`` groups (e.g.
    ``2019-08-19``) and ``<word>, <4 digits>`` (e.g. ``May, 2005``). A
    fragment is removed only when ``date_format`` accepts it as a date;
    anything else is left untouched.

    :param text: raw price text.
    :return: whitespace-normalized text with the recognised dates removed.
    """
    # normalize whitespace
    text = re.sub(r'\s+', ' ', text)
    date_patterns = (
        r'\d{1,4}-\d{1,2}-\d{2,4}',
        # The separator before the month name is consumed together with the
        # date; ``^`` also lets a date at the very start of the text match.
        r'(?:^|\s)\S{3,8},\s\d{4}',
    )

    def _drop_if_date(match):
        # Remove exactly the matched span, and only if it is a real date.
        fragment = match.group(0)
        return '' if date_format(fragment.strip()) else fragment

    for pattern in date_patterns:
        text = re.sub(pattern, _drop_if_date, text)

    return text
50 changes: 49 additions & 1 deletion tests/test_price_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@
we've found in the wild; PRICE_PARSING_EXAMPLES_NEW is a list of tests for
new features. New tests should probably go into these two lists.
"""
from typing import Optional, Union
from datetime import datetime
from decimal import Decimal
from typing import Optional, Union

import pytest

from price_parser import Price
from price_parser.parser import date_format, strip_date


class Example(Price):
Expand Down Expand Up @@ -1944,6 +1946,21 @@ def __eq__(self, other):
Example(None, '15.08.2017',
None, None, None),

Example(None, '0€ until May, 2005, 35€ afterwards',
'€', '0', 0),

Example(None, '2019-08-19: 22 USD',
'USD', '22', 22),

Example(None, '2105 EUR at July, 2004',
'EUR', '2105', 2105),

Example(None, '$10 EUR during March, 2016',
'$', '10', 10),

Example(None, '$10 EUR at March, 2016 or 2019-08-19',
'$', '10', 10),
Copy link
Member

@kmike kmike Oct 18, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hey @bulatbulat48! All these tests are added to PRICE_PARSING_EXAMPLES_XFAIL, i.e. a list of examples which are known not to work. It means that if they fail, test suite is green. Could you please move them to PRICE_PARSING_EXAMPLES_BUGS_CAUGHT?

Copy link
Author

@bulatbulat48 bulatbulat48 Oct 19, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. Moved tests. Thank you for the review!


# other incorrectly extracted prices
Example('8.5', '25-09',
None, None, None),
Expand Down Expand Up @@ -1986,3 +2003,34 @@ def test_parsing(example: Example):
)
def test_price_amount_float(amount, amount_float):
assert Price(amount, None, None).amount_float == amount_float


@pytest.mark.parametrize(
    "price, result",
    [
        # one case per supported layout
        ('10.04.2004', datetime(2004, 4, 10, 0, 0)),
        ('July, 2004', datetime(2004, 7, 1, 0, 0)),
        ('Jul, 2004', datetime(2004, 7, 1, 0, 0)),
        ('2019-08-19', datetime(2019, 8, 19, 0, 0)),
        # plain numbers and empty text are not dates
        ('200', None),
        ('2004', None),
        ('', None),
        # non-string input is never a date
        (2004, None),
        (10.2014, None),
        (None, None),
    ]
)
def test_date_format(price, result):
    """date_format returns a datetime for whole-string dates, else None."""
    assert date_format(price) == result
bulatbulat48 marked this conversation as resolved.
Show resolved Hide resolved


@pytest.mark.parametrize(
    "price, result",
    [
        ('0€ until May, 2005, 35€ afterwards', '0€ until, 35€ afterwards'),
        ('2019-08-19: 22 USD', ': 22 USD'),
        ('105 EUR at July, 2004', '105 EUR at'),
        ('$10 EUR during March, 2016', '$10 EUR during'),
        ('$10 EUR during March, 2016 -- March, 2020', '$10 EUR during --'),
        ('$10', '$10'),
        # date-shaped fragment that is not a real date must be kept
        ('10 EUR, 2004', '10 EUR, 2004'),
        # runs of whitespace are collapsed to a single space
        ('$10  \t EUR', '$10 EUR'),
    ]
)
def test_strip_date(price, result):
    """strip_date removes recognised dates and leaves other text intact."""
    assert strip_date(price) == result