diff --git a/number_parser/data/rom.py b/number_parser/data/rom.py
new file mode 100644
index 0000000..a8bb44a
--- /dev/null
+++ b/number_parser/data/rom.py
@@ -0,0 +1,49 @@
+# Roman numeral lookup tables, keyed by lowercase numeral.
+# Each key is the complete numeral for one decimal place (units, tens,
+# hundreds, thousands); "x" (10) is kept in DIRECT_NUMBERS, the remaining
+# tens in TENS. Every units value 1-9 must be present, including "v".
+info = {
+    "UNIT_NUMBERS": {
+        "i": 1,
+        "ii": 2,
+        "iii": 3,
+        "iv": 4,
+        "v": 5,
+        "vi": 6,
+        "vii": 7,
+        "viii": 8,
+        "ix": 9
+    },
+    "DIRECT_NUMBERS": {
+        "x": 10,
+
+    },
+    "TENS": {
+        "xx": 20,
+        "xxx": 30,
+        "xl": 40,
+        "l": 50,
+        "lx": 60,
+        "lxx": 70,
+        "lxxx": 80,
+        "xc": 90
+    },
+    "HUNDREDS": {
+        "c": 100,
+        "cc": 200,
+        "ccc": 300,
+        "cd": 400,
+        "d": 500,
+        "dc": 600,
+        "dcc": 700,
+        "dccc": 800,
+        "cm": 900
+    },
+    "BIG_POWERS_OF_TEN": {
+        "m": 1000,
+        "mm": 2000,
+        "mmm": 3000
+    },
+    "SKIP_TOKENS": [],
+    "USE_LONG_SCALE": False
+}
\ No newline at end of file
diff --git a/number_parser/parser.py b/number_parser/parser.py
index e0d67c2..f552f4e 100644
--- a/number_parser/parser.py
+++ b/number_parser/parser.py
@@ -2,7 +2,7 @@
 from importlib import import_module
 import unicodedata
 SENTENCE_SEPARATORS = [".", ","]
-SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru']
+SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru', 'rom']
 RE_BUG_LANGUAGES = ['hi']
 
 
@@ -141,6 +141,8 @@ def _build_number(token_list, lang_data):
 
 def _tokenize(input_string, language):
     """Breaks string on any non-word character."""
+    if language == 'rom':
+        return re.split("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", input_string.lower())
     input_string = input_string.replace('\xad', '')
     if language in RE_BUG_LANGUAGES:
         return re.split(r'(\s+)', input_string)
@@ -310,6 +312,14 @@ def parse(input_string, language=None):
 
     tokens = _tokenize(input_string, language)
 
+    if language == 'rom':
+        tokens = _tokenize(input_string, language=None)
+        for token in tokens:
+            if re.search("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", token.lower()):
+                tokens[tokens.index(token)] = str(parse_number(token, language='rom'))
+        final_sentance = ''.join(tokens)
+        return final_sentance
+
     final_sentence = []
     current_sentence = []
     tokens_taken = []