Skip to content

Commit 109538b

Browse files
committed
Improve cromulent.extract.normalize_dimension to handle values with only partial units (#87).
1 parent b8e8f57 commit 109538b

File tree

2 files changed

+33
-1
lines changed

2 files changed

+33
-1
lines changed

cromulent/extract.py

+12
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,13 @@
2222

2323
#mark - Dimensions
2424

25+
NEXT_FINER_DIMENSION_UNIT = {
26+
'inches': None,
27+
'feet': 'inches',
28+
'cm': None,
29+
'fr_feet': 'fr_inches',
30+
'fr_inches': 'ligne'
31+
}
2532
NUMBER_PATTERN = r'((?:\d+\s+\d+/\d+)|(?:\d+/\d+)|(?:\d+(?:[.,]\d+)?))'
2633
UNIT_PATTERN = r'''('|"|d(?:[.]?|uymen)|pouc[e.]s?|in(?:ch(?:es)?|[.]?)|'''\
2734
r'''pieds?|v[.]?|voeten|f(?:eet|oot|t[.]?)|cm)'''
@@ -122,6 +129,8 @@ def parse_simple_dimensions(value, which=None):
122129
value = value.strip()
123130
dims = []
124131
# warnings.warn('DIMENSION: %s' % (value,))
132+
133+
last_unit = None
125134
for match in re.finditer(DIMENSION_RE, value):
126135
# warnings.warn('--> match %s' % (match,))
127136
matched_value = _canonical_value(match.group(2))
@@ -130,10 +139,13 @@ def parse_simple_dimensions(value, which=None):
130139
return None
131140
unit_value = match.group(3)
132141
matched_unit = _canonical_unit(unit_value)
142+
if matched_unit is None:
143+
matched_unit = NEXT_FINER_DIMENSION_UNIT.get(last_unit)
133144
if unit_value and not matched_unit:
134145
warnings.warn('*** not a recognized unit: %s' % (unit_value,))
135146
which = _canonical_which(which)
136147
dim = Dimension(value=matched_value, unit=matched_unit, which=which)
148+
last_unit = matched_unit
137149
dims.append(dim)
138150
if not dims:
139151
return None

tests/test_dimensions.py

+21-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import unittest
44
import pprint
55
from datetime import datetime
6-
from cromulent.extract import Dimension
6+
from cromulent.extract import Dimension, normalized_dimension_object
77
import cromulent.extract
88

99
class TestDateCleaners(unittest.TestCase):
@@ -45,6 +45,10 @@ def test_parse_simple_dimensions(self):
4545
'8 1/4 pouces': [Dimension('8.25', 'inches', None)],
4646
'8 1/8 pouces': [Dimension('8.125', 'inches', None)],
4747
'1': [Dimension('1', None, None)],
48+
49+
# a value without a unit that follows a value with a unit stays in the same measurement system, but uses the next-finer unit (e.g. feet -> inches)
50+
'2 pieds 3': [Dimension('2', 'feet', None), Dimension('3', 'inches', None)],
51+
"1' 3": [Dimension('1', 'feet', None), Dimension('3', 'inches', None)],
4852
}
4953

5054
for value, expected in tests.items():
@@ -88,5 +92,21 @@ def test_dimension_cleaner(self):
8892
else:
8993
self.assertIsNone(dims)
9094

95+
def test_normalize_dimension(self):
96+
tests = {
97+
'1 ft, 2 in': ('1 feet, 2 inches', Dimension(value='14.0', unit='inches', which=None)),
98+
'8 1/2 pouces': ('8.5 inches', Dimension(value='8.5', unit='inches', which=None)),
99+
'1 pied 7 pouces': ('1 feet, 7 inches', Dimension(value='19.0', unit='inches', which=None)),
100+
'2 pied 1/2 pouces': ('2 feet, 0.5 inches', Dimension(value='24.5', unit='inches', which=None)),
101+
"4' 8": ('4 feet, 8 inches', Dimension(value='56.0', unit='inches', which=None)),
102+
"1 pied 2": ('1 feet, 2 inches', Dimension(value='14.0', unit='inches', which=None)),
103+
}
104+
for value, expected in tests.items():
105+
elabel, edim = expected
106+
dims = cromulent.extract.parse_simple_dimensions(value)
107+
dim, label = normalized_dimension_object(dims)
108+
self.assertEqual(label, elabel)
109+
self.assertEqual(dim, edim)
110+
91111
if __name__ == '__main__':
92112
unittest.main()

0 commit comments

Comments
 (0)