From 386fdd928b3cd01b62d155f55b461d8d2cff8b40 Mon Sep 17 00:00:00 2001 From: AmPhIbIaN26 <43638430+AmPhIbIaN26@users.noreply.github.com> Date: Sat, 10 Apr 2021 23:06:47 +0530 Subject: [PATCH 01/11] Implemented parsing Roman Numerals I have implemented parsing for roman, for the case of parsing any roman numeral in a sentence you have to specify the language. --- number_parser/data/rom.py | 47 +++++++++++++++++++++++++++++++++++++++ number_parser/parser.py | 13 ++++++++++- 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 number_parser/data/rom.py diff --git a/number_parser/data/rom.py b/number_parser/data/rom.py new file mode 100644 index 0000000..c05939f --- /dev/null +++ b/number_parser/data/rom.py @@ -0,0 +1,47 @@ +info = { + "UNIT_NUMBERS": { + "i": 1, + "ii": 2, + "iii": 3, + "iv": 5, + "vi": 6, + "vii": 7, + "viii": 8, + "ix": 9 + }, + "DIRECT_NUMBERS": { + "x": 10, + + }, + "TENS": { + "xx": 20, + "xxx": 30, + "xl": 40, + "l": 50, + "lx": 60, + "lxx": 70, + "lxxx": 80, + "xc": 90 + }, + "HUNDREDS": { + "c": 100, + "cc": 200, + "ccc": 300, + "cd": 400, + "d": 500, + "dc": 600, + "dcc": 700, + "dccc": 800, + "cm": 900 + }, + "BIG_POWERS_OF_TEN": { + "m": 1000, + "mm": 2000, + "mmm": 3000 + }, + "SKIP_TOKENS": [ + "-", + "and" + ], + "USE_LONG_SCALE": False +} diff --git a/number_parser/parser.py b/number_parser/parser.py index e0d67c2..59ce810 100644 --- a/number_parser/parser.py +++ b/number_parser/parser.py @@ -1,8 +1,9 @@ import re from importlib import import_module import unicodedata + SENTENCE_SEPARATORS = [".", ","] -SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru'] +SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru', 'rom'] RE_BUG_LANGUAGES = ['hi'] @@ -141,6 +142,8 @@ def _build_number(token_list, lang_data): def _tokenize(input_string, language): """Breaks string on any non-word character.""" + if language == 'rom': + return re.split("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", input_string.lower()) input_string = input_string.replace('\xad', '') if language in RE_BUG_LANGUAGES: return re.split(r'(\s+)', input_string) @@ -310,6 +313,14 @@ def parse(input_string, language=None): tokens = _tokenize(input_string, language) + if language == 'rom': + tokens = _tokenize(input_string, language=None) + for token in tokens: + if re.search("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", token.lower()): + tokens[tokens.index(token)] = str(parse_number(token, language='rom')) + final_sentance = ''.join(tokens) + return final_sentance + final_sentence = [] current_sentence = [] tokens_taken = [] From 81ed627b9644ff18a6271f41458d1f024b384cd0 Mon Sep 17 00:00:00 2001 From: AmPhIbIaN26 <43638430+AmPhIbIaN26@users.noreply.github.com> Date: Thu, 15 Apr 2021 02:07:12 +0530 Subject: [PATCH 02/11] Revert "Implemented parsing Roman Numerals" This reverts commit 386fdd928b3cd01b62d155f55b461d8d2cff8b40. --- number_parser/data/rom.py | 47 --------------------------------------- number_parser/parser.py | 13 +---------- 2 files changed, 1 insertion(+), 59 deletions(-) delete mode 100644 number_parser/data/rom.py diff --git a/number_parser/data/rom.py b/number_parser/data/rom.py deleted file mode 100644 index c05939f..0000000 --- a/number_parser/data/rom.py +++ /dev/null @@ -1,47 +0,0 @@ -info = { - "UNIT_NUMBERS": { - "i": 1, - "ii": 2, - "iii": 3, - "iv": 5, - "vi": 6, - "vii": 7, - "viii": 8, - "ix": 9 - }, - "DIRECT_NUMBERS": { - "x": 10, - - }, - "TENS": { - "xx": 20, - "xxx": 30, - "xl": 40, - "l": 50, - "lx": 60, - "lxx": 70, - "lxxx": 80, - "xc": 90 - }, - "HUNDREDS": { - "c": 100, - "cc": 200, - "ccc": 300, - "cd": 400, - "d": 500, - "dc": 600, - "dcc": 700, - "dccc": 800, - "cm": 900 - }, - "BIG_POWERS_OF_TEN": { - "m": 1000, - "mm": 2000, - "mmm": 3000 - }, - "SKIP_TOKENS": [ - "-", - "and" - ], - "USE_LONG_SCALE": False -} diff --git a/number_parser/parser.py b/number_parser/parser.py index 59ce810..e0d67c2 100644 --- a/number_parser/parser.py +++ b/number_parser/parser.py @@ -1,9 +1,8 @@ import re from importlib import import_module import unicodedata - SENTENCE_SEPARATORS = [".", ","] -SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru', 'rom'] +SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru'] RE_BUG_LANGUAGES = ['hi'] @@ -142,8 +141,6 @@ def _build_number(token_list, lang_data): def _tokenize(input_string, language): """Breaks string on any non-word character.""" - if language == 'rom': - return re.split("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", input_string.lower()) input_string = input_string.replace('\xad', '') if language in RE_BUG_LANGUAGES: return re.split(r'(\s+)', input_string) @@ -313,14 +310,6 @@ def parse(input_string, language=None): tokens = _tokenize(input_string, language) - if language == 'rom': - tokens = _tokenize(input_string, language=None) - for token in tokens: - if re.search("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", token.lower()): - tokens[tokens.index(token)] = str(parse_number(token, language='rom')) - final_sentance = ''.join(tokens) - return final_sentance - final_sentence = [] current_sentence = [] tokens_taken = [] From 6658bc40e28c81771fe84f463e4a71ecaac632f7 Mon Sep 17 00:00:00 2001 From: AmPhIbIaN26 <43638430+AmPhIbIaN26@users.noreply.github.com> Date: Thu, 15 Apr 2021 02:28:07 +0530 Subject: [PATCH 03/11] Implemented parsing Roman Numerals I have implemented parsing for roman, for the case of parsing any roman numeral in a sentence you have to specify the language. --- number_parser/data/rom.py | 44 +++++++++++++++++++++++++++++++++++++++ number_parser/parser.py | 12 ++++++++++- 2 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 number_parser/data/rom.py diff --git a/number_parser/data/rom.py b/number_parser/data/rom.py new file mode 100644 index 0000000..a8bb44a --- /dev/null +++ b/number_parser/data/rom.py @@ -0,0 +1,44 @@ +info = { + "UNIT_NUMBERS": { + "i": 1, + "ii": 2, + "iii": 3, + "iv": 5, + "vi": 6, + "vii": 7, + "viii": 8, + "ix": 9 + }, + "DIRECT_NUMBERS": { + "x": 10, + + }, + "TENS": { + "xx": 20, + "xxx": 30, + "xl": 40, + "l": 50, + "lx": 60, + "lxx": 70, + "lxxx": 80, + "xc": 90 + }, + "HUNDREDS": { + "c": 100, + "cc": 200, + "ccc": 300, + "cd": 400, + "d": 500, + "dc": 600, + "dcc": 700, + "dccc": 800, + "cm": 900 + }, + "BIG_POWERS_OF_TEN": { + "m": 1000, + "mm": 2000, + "mmm": 3000 + }, + "SKIP_TOKENS": [], + "USE_LONG_SCALE": False +} \ No newline at end of file diff --git a/number_parser/parser.py b/number_parser/parser.py index e0d67c2..f552f4e 100644 --- a/number_parser/parser.py +++ b/number_parser/parser.py @@ -2,7 +2,7 @@ from importlib import import_module import unicodedata SENTENCE_SEPARATORS = [".", ","] -SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru'] +SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru', 'rom'] RE_BUG_LANGUAGES = ['hi'] @@ -141,6 +141,8 @@ def _build_number(token_list, lang_data): def _tokenize(input_string, language): """Breaks string on any non-word character.""" + if language == 'rom': + return re.split("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", input_string.lower()) input_string = input_string.replace('\xad', '') if language in RE_BUG_LANGUAGES: return re.split(r'(\s+)', input_string) @@ -310,6 +312,14 @@ def parse(input_string, language=None): tokens = _tokenize(input_string, language) + if language == 'rom': + tokens = _tokenize(input_string, language=None) + for token in tokens: + if re.search("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", token.lower()): + tokens[tokens.index(token)] = str(parse_number(token, language='rom')) + final_sentance = ''.join(tokens) + return final_sentance + final_sentence = [] current_sentence = [] tokens_taken = [] From ff64b93ab6767df981224a2a971748a9da9604f8 Mon Sep 17 00:00:00 2001 From: AmPhIbIaN26 <43638430+AmPhIbIaN26@users.noreply.github.com> Date: Mon, 3 May 2021 23:47:07 +0530 Subject: [PATCH 04/11] Revert "Implemented parsing Roman Numerals" This reverts commit 6658bc40e28c81771fe84f463e4a71ecaac632f7. --- number_parser/data/rom.py | 44 --------------------------------------- number_parser/parser.py | 12 +---------- 2 files changed, 1 insertion(+), 55 deletions(-) delete mode 100644 number_parser/data/rom.py diff --git a/number_parser/data/rom.py b/number_parser/data/rom.py deleted file mode 100644 index a8bb44a..0000000 --- a/number_parser/data/rom.py +++ /dev/null @@ -1,44 +0,0 @@ -info = { - "UNIT_NUMBERS": { - "i": 1, - "ii": 2, - "iii": 3, - "iv": 5, - "vi": 6, - "vii": 7, - "viii": 8, - "ix": 9 - }, - "DIRECT_NUMBERS": { - "x": 10, - - }, - "TENS": { - "xx": 20, - "xxx": 30, - "xl": 40, - "l": 50, - "lx": 60, - "lxx": 70, - "lxxx": 80, - "xc": 90 - }, - "HUNDREDS": { - "c": 100, - "cc": 200, - "ccc": 300, - "cd": 400, - "d": 500, - "dc": 600, - "dcc": 700, - "dccc": 800, - "cm": 900 - }, - "BIG_POWERS_OF_TEN": { - "m": 1000, - "mm": 2000, - "mmm": 3000 - }, - "SKIP_TOKENS": [], - "USE_LONG_SCALE": False -} \ No newline at end of file diff --git a/number_parser/parser.py b/number_parser/parser.py index f552f4e..e0d67c2 100644 --- a/number_parser/parser.py +++ b/number_parser/parser.py @@ -2,7 +2,7 @@ from importlib import import_module import unicodedata SENTENCE_SEPARATORS = [".", ","] -SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru', 'rom'] +SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru'] RE_BUG_LANGUAGES = ['hi'] @@ -141,8 +141,6 @@ def _build_number(token_list, lang_data): def _tokenize(input_string, language): """Breaks string on any non-word character.""" - if language == 'rom': - return re.split("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", input_string.lower()) input_string = input_string.replace('\xad', '') if language in RE_BUG_LANGUAGES: return re.split(r'(\s+)', input_string) @@ -312,14 +310,6 @@ def parse(input_string, language=None): tokens = _tokenize(input_string, language) - if language == 'rom': - tokens = _tokenize(input_string, language=None) - for token in tokens: - if re.search("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", token.lower()): - tokens[tokens.index(token)] = str(parse_number(token, language='rom')) - final_sentance = ''.join(tokens) - return final_sentance - final_sentence = [] current_sentence = [] tokens_taken = [] From c6285cb1ba6e75544a517aa62f2ae64010994d8f Mon Sep 17 00:00:00 2001 From: AmPhIbIaN26 <43638430+AmPhIbIaN26@users.noreply.github.com> Date: Mon, 3 May 2021 23:47:18 +0530 Subject: [PATCH 05/11] Revert "Revert "Implemented parsing Roman Numerals"" This reverts commit 81ed627b9644ff18a6271f41458d1f024b384cd0. --- number_parser/data/rom.py | 47 +++++++++++++++++++++++++++++++++++++++ number_parser/parser.py | 13 ++++++++++- 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 number_parser/data/rom.py diff --git a/number_parser/data/rom.py b/number_parser/data/rom.py new file mode 100644 index 0000000..c05939f --- /dev/null +++ b/number_parser/data/rom.py @@ -0,0 +1,47 @@ +info = { + "UNIT_NUMBERS": { + "i": 1, + "ii": 2, + "iii": 3, + "iv": 5, + "vi": 6, + "vii": 7, + "viii": 8, + "ix": 9 + }, + "DIRECT_NUMBERS": { + "x": 10, + + }, + "TENS": { + "xx": 20, + "xxx": 30, + "xl": 40, + "l": 50, + "lx": 60, + "lxx": 70, + "lxxx": 80, + "xc": 90 + }, + "HUNDREDS": { + "c": 100, + "cc": 200, + "ccc": 300, + "cd": 400, + "d": 500, + "dc": 600, + "dcc": 700, + "dccc": 800, + "cm": 900 + }, + "BIG_POWERS_OF_TEN": { + "m": 1000, + "mm": 2000, + "mmm": 3000 + }, + "SKIP_TOKENS": [ + "-", + "and" + ], + "USE_LONG_SCALE": False +} diff --git a/number_parser/parser.py b/number_parser/parser.py index e0d67c2..59ce810 100644 --- a/number_parser/parser.py +++ b/number_parser/parser.py @@ -1,8 +1,9 @@ import re from importlib import import_module import unicodedata + SENTENCE_SEPARATORS = [".", ","] -SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru'] +SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru', 'rom'] RE_BUG_LANGUAGES = ['hi'] @@ -141,6 +142,8 @@ def _build_number(token_list, lang_data): def _tokenize(input_string, language): """Breaks string on any non-word character.""" + if language == 'rom': + return re.split("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", input_string.lower()) input_string = input_string.replace('\xad', '') if language in RE_BUG_LANGUAGES: return re.split(r'(\s+)', input_string) @@ -310,6 +313,14 @@ def parse(input_string, language=None): tokens = _tokenize(input_string, language) + if language == 'rom': + tokens = _tokenize(input_string, language=None) + for token in tokens: + if re.search("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", token.lower()): + tokens[tokens.index(token)] = str(parse_number(token, language='rom')) + final_sentance = ''.join(tokens) + return final_sentance + final_sentence = [] current_sentence = [] tokens_taken = [] From ce8136562ed08767d12355bd6675eb094ab8027b Mon Sep 17 00:00:00 2001 From: AmPhIbIaN26 <43638430+AmPhIbIaN26@users.noreply.github.com> Date: Mon, 3 May 2021 23:47:23 +0530 Subject: [PATCH 06/11] Revert "Implemented parsing Roman Numerals" This reverts commit 386fdd928b3cd01b62d155f55b461d8d2cff8b40. --- number_parser/data/rom.py | 47 --------------------------------------- number_parser/parser.py | 13 +---------- 2 files changed, 1 insertion(+), 59 deletions(-) delete mode 100644 number_parser/data/rom.py diff --git a/number_parser/data/rom.py b/number_parser/data/rom.py deleted file mode 100644 index c05939f..0000000 --- a/number_parser/data/rom.py +++ /dev/null @@ -1,47 +0,0 @@ -info = { - "UNIT_NUMBERS": { - "i": 1, - "ii": 2, - "iii": 3, - "iv": 5, - "vi": 6, - "vii": 7, - "viii": 8, - "ix": 9 - }, - "DIRECT_NUMBERS": { - "x": 10, - - }, - "TENS": { - "xx": 20, - "xxx": 30, - "xl": 40, - "l": 50, - "lx": 60, - "lxx": 70, - "lxxx": 80, - "xc": 90 - }, - "HUNDREDS": { - "c": 100, - "cc": 200, - "ccc": 300, - "cd": 400, - "d": 500, - "dc": 600, - "dcc": 700, - "dccc": 800, - "cm": 900 - }, - "BIG_POWERS_OF_TEN": { - "m": 1000, - "mm": 2000, - "mmm": 3000 - }, - "SKIP_TOKENS": [ - "-", - "and" - ], - "USE_LONG_SCALE": False -} diff --git a/number_parser/parser.py b/number_parser/parser.py index 59ce810..e0d67c2 100644 --- a/number_parser/parser.py +++ b/number_parser/parser.py @@ -1,9 +1,8 @@ import re from importlib import import_module import unicodedata - SENTENCE_SEPARATORS = [".", ","] -SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru', 'rom'] +SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru'] RE_BUG_LANGUAGES = ['hi'] @@ -142,8 +141,6 @@ def _build_number(token_list, lang_data): def _tokenize(input_string, language): """Breaks string on any non-word character.""" - if language == 'rom': - return re.split("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", input_string.lower()) input_string = input_string.replace('\xad', '') if language in RE_BUG_LANGUAGES: return re.split(r'(\s+)', input_string) @@ -313,14 +310,6 @@ def parse(input_string, language=None): tokens = _tokenize(input_string, language) - if language == 'rom': - tokens = _tokenize(input_string, language=None) - for token in tokens: - if re.search("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", token.lower()): - tokens[tokens.index(token)] = str(parse_number(token, language='rom')) - final_sentance = ''.join(tokens) - return final_sentance - final_sentence = [] current_sentence = [] tokens_taken = [] From b36483e3b5fb33ed583ab35c7e7740ac6b93aa25 Mon Sep 17 00:00:00 2001 From: AmPhIbIaN26 <43638430+AmPhIbIaN26@users.noreply.github.com> Date: Mon, 3 May 2021 23:54:25 +0530 Subject: [PATCH 07/11] Update __init__.py added encoding='utf8' to __init__.py --- tests/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/__init__.py b/tests/__init__.py index 31eac93..f634674 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -20,7 +20,7 @@ def get_test_files(path, prefix): def _test_files(path, language, is_ordinal=True): fnx = parse_ordinal if is_ordinal else parse_number for filename in get_test_files(path, f'{language}_'): - with open(filename, "r") as csv_file: + with open(filename, "r", encoding='utf8') as csv_file: csv_reader = csv.DictReader(csv_file) for row in csv_reader: try: From be5d4e32bc631560cd79f7bcdd9ff71803680a10 Mon Sep 17 00:00:00 2001 From: AmPhIbIaN26 <43638430+AmPhIbIaN26@users.noreply.github.com> Date: Mon, 3 May 2021 23:57:20 +0530 Subject: [PATCH 08/11] Revert "Update __init__.py" This reverts commit b36483e3b5fb33ed583ab35c7e7740ac6b93aa25. --- tests/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/__init__.py b/tests/__init__.py index f634674..31eac93 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -20,7 +20,7 @@ def get_test_files(path, prefix): def _test_files(path, language, is_ordinal=True): fnx = parse_ordinal if is_ordinal else parse_number for filename in get_test_files(path, f'{language}_'): - with open(filename, "r", encoding='utf8') as csv_file: + with open(filename, "r") as csv_file: csv_reader = csv.DictReader(csv_file) for row in csv_reader: try: From 8e2834318bc38f719013f9bbfc7d9eff4059d3c6 Mon Sep 17 00:00:00 2001 From: AmPhIbIaN26 <43638430+AmPhIbIaN26@users.noreply.github.com> Date: Mon, 3 May 2021 23:57:22 +0530 Subject: [PATCH 09/11] Revert "Revert "Implemented parsing Roman Numerals"" This reverts commit ce8136562ed08767d12355bd6675eb094ab8027b. --- number_parser/data/rom.py | 47 +++++++++++++++++++++++++++++++++++++++ number_parser/parser.py | 13 ++++++++++- 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 number_parser/data/rom.py diff --git a/number_parser/data/rom.py b/number_parser/data/rom.py new file mode 100644 index 0000000..c05939f --- /dev/null +++ b/number_parser/data/rom.py @@ -0,0 +1,47 @@ +info = { + "UNIT_NUMBERS": { + "i": 1, + "ii": 2, + "iii": 3, + "iv": 5, + "vi": 6, + "vii": 7, + "viii": 8, + "ix": 9 + }, + "DIRECT_NUMBERS": { + "x": 10, + + }, + "TENS": { + "xx": 20, + "xxx": 30, + "xl": 40, + "l": 50, + "lx": 60, + "lxx": 70, + "lxxx": 80, + "xc": 90 + }, + "HUNDREDS": { + "c": 100, + "cc": 200, + "ccc": 300, + "cd": 400, + "d": 500, + "dc": 600, + "dcc": 700, + "dccc": 800, + "cm": 900 + }, + "BIG_POWERS_OF_TEN": { + "m": 1000, + "mm": 2000, + "mmm": 3000 + }, + "SKIP_TOKENS": [ + "-", + "and" + ], + "USE_LONG_SCALE": False +} diff --git a/number_parser/parser.py b/number_parser/parser.py index e0d67c2..59ce810 100644 --- a/number_parser/parser.py +++ b/number_parser/parser.py @@ -1,8 +1,9 @@ import re from importlib import import_module import unicodedata + SENTENCE_SEPARATORS = [".", ","] -SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru'] +SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru', 'rom'] RE_BUG_LANGUAGES = ['hi'] @@ -141,6 +142,8 @@ def _build_number(token_list, lang_data): def _tokenize(input_string, language): """Breaks string on any non-word character.""" + if language == 'rom': + return re.split("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", input_string.lower()) input_string = input_string.replace('\xad', '') if language in RE_BUG_LANGUAGES: return re.split(r'(\s+)', input_string) @@ -310,6 +313,14 @@ def parse(input_string, language=None): tokens = _tokenize(input_string, language) + if language == 'rom': + tokens = _tokenize(input_string, language=None) + for token in tokens: + if re.search("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", token.lower()): + tokens[tokens.index(token)] = str(parse_number(token, language='rom')) + final_sentance = ''.join(tokens) + return final_sentance + final_sentence = [] current_sentence = [] tokens_taken = [] From 9a6fb48ed577607212cf49046c764c93291d2f96 Mon Sep 17 00:00:00 2001 From: AmPhIbIaN26 <43638430+AmPhIbIaN26@users.noreply.github.com> Date: Mon, 3 May 2021 23:57:26 +0530 Subject: [PATCH 10/11] Revert "Revert "Revert "Implemented parsing Roman Numerals""" This reverts commit c6285cb1ba6e75544a517aa62f2ae64010994d8f. --- number_parser/data/rom.py | 47 --------------------------------------- number_parser/parser.py | 13 +---------- 2 files changed, 1 insertion(+), 59 deletions(-) delete mode 100644 number_parser/data/rom.py diff --git a/number_parser/data/rom.py b/number_parser/data/rom.py deleted file mode 100644 index c05939f..0000000 --- a/number_parser/data/rom.py +++ /dev/null @@ -1,47 +0,0 @@ -info = { - "UNIT_NUMBERS": { - "i": 1, - "ii": 2, - "iii": 3, - "iv": 5, - "vi": 6, - "vii": 7, - "viii": 8, - "ix": 9 - }, - "DIRECT_NUMBERS": { - "x": 10, - - }, - "TENS": { - "xx": 20, - "xxx": 30, - "xl": 40, - "l": 50, - "lx": 60, - "lxx": 70, - "lxxx": 80, - "xc": 90 - }, - "HUNDREDS": { - "c": 100, - "cc": 200, - "ccc": 300, - "cd": 400, - "d": 500, - "dc": 600, - "dcc": 700, - "dccc": 800, - "cm": 900 - }, - "BIG_POWERS_OF_TEN": { - "m": 1000, - "mm": 2000, - "mmm": 3000 - }, - "SKIP_TOKENS": [ - "-", - "and" - ], - "USE_LONG_SCALE": False -} diff --git a/number_parser/parser.py b/number_parser/parser.py index 59ce810..e0d67c2 100644 --- a/number_parser/parser.py +++ b/number_parser/parser.py @@ -1,9 +1,8 @@ import re from importlib import import_module import unicodedata - SENTENCE_SEPARATORS = [".", ","] -SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru', 'rom'] +SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru'] RE_BUG_LANGUAGES = ['hi'] @@ -142,8 +141,6 @@ def _build_number(token_list, lang_data): def _tokenize(input_string, language): """Breaks string on any non-word character.""" - if language == 'rom': - return re.split("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", input_string.lower()) input_string = input_string.replace('\xad', '') if language in RE_BUG_LANGUAGES: return re.split(r'(\s+)', input_string) @@ -313,14 +310,6 @@ def parse(input_string, language=None): tokens = _tokenize(input_string, language) - if language == 'rom': - tokens = _tokenize(input_string, language=None) - for token in tokens: - if re.search("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", token.lower()): - tokens[tokens.index(token)] = str(parse_number(token, language='rom')) - final_sentance = ''.join(tokens) - return final_sentance - final_sentence = [] current_sentence = [] tokens_taken = [] From 4c127ecd1c5fcb3f118f3ae62fde2e18b987eed4 Mon Sep 17 00:00:00 2001 From: AmPhIbIaN26 <43638430+AmPhIbIaN26@users.noreply.github.com> Date: Mon, 3 May 2021 23:57:29 +0530 Subject: [PATCH 11/11] Revert "Revert "Implemented parsing Roman Numerals"" This reverts commit ff64b93ab6767df981224a2a971748a9da9604f8. --- number_parser/data/rom.py | 44 +++++++++++++++++++++++++++++++++++++++ number_parser/parser.py | 12 ++++++++++- 2 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 number_parser/data/rom.py diff --git a/number_parser/data/rom.py b/number_parser/data/rom.py new file mode 100644 index 0000000..a8bb44a --- /dev/null +++ b/number_parser/data/rom.py @@ -0,0 +1,44 @@ +info = { + "UNIT_NUMBERS": { + "i": 1, + "ii": 2, + "iii": 3, + "iv": 5, + "vi": 6, + "vii": 7, + "viii": 8, + "ix": 9 + }, + "DIRECT_NUMBERS": { + "x": 10, + + }, + "TENS": { + "xx": 20, + "xxx": 30, + "xl": 40, + "l": 50, + "lx": 60, + "lxx": 70, + "lxxx": 80, + "xc": 90 + }, + "HUNDREDS": { + "c": 100, + "cc": 200, + "ccc": 300, + "cd": 400, + "d": 500, + "dc": 600, + "dcc": 700, + "dccc": 800, + "cm": 900 + }, + "BIG_POWERS_OF_TEN": { + "m": 1000, + "mm": 2000, + "mmm": 3000 + }, + "SKIP_TOKENS": [], + "USE_LONG_SCALE": False +} \ No newline at end of file diff --git a/number_parser/parser.py b/number_parser/parser.py index e0d67c2..f552f4e 100644 --- a/number_parser/parser.py +++ b/number_parser/parser.py @@ -2,7 +2,7 @@ from importlib import import_module import unicodedata SENTENCE_SEPARATORS = [".", ","] -SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru'] +SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru', 'rom'] RE_BUG_LANGUAGES = ['hi'] @@ -141,6 +141,8 @@ def _build_number(token_list, lang_data): def _tokenize(input_string, language): """Breaks string on any non-word character.""" + if language == 'rom': + return re.split("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", input_string.lower()) input_string = input_string.replace('\xad', '') if language in RE_BUG_LANGUAGES: return re.split(r'(\s+)', input_string) @@ -310,6 +312,14 @@ def parse(input_string, language=None): tokens = _tokenize(input_string, language) + if language == 'rom': + tokens = _tokenize(input_string, language=None) + for token in tokens: + if re.search("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", token.lower()): + tokens[tokens.index(token)] = str(parse_number(token, language='rom')) + final_sentance = ''.join(tokens) + return final_sentance + final_sentence = [] current_sentence = [] tokens_taken = []