
Commit 2787185

Handle potential memory usage issue due to LRU caching (#167)
We now limit the number of items in each cache to 64K (2**16) so that memory usage is capped and does not grow without bound. This only matters for use cases where a metric is called repeatedly through the API, such as when computing rewards in reinforcement learning setups.
1 parent 078c440 commit 2787185
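
For illustration (not part of the commit), here is a minimal sketch of how a bounded functools.lru_cache caps memory: once maxsize entries exist, each new key evicts the least recently used one, so the cache never grows past the cap.

    from functools import lru_cache

    @lru_cache(maxsize=2**16)  # keep at most 65,536 results
    def tokenize(line: str) -> str:
        # Stand-in for real tokenization logic
        return ' '.join(line.split())

    for i in range(100_000):
        tokenize(f'sentence {i}')

    # Despite 100,000 distinct inputs, the cache holds only 65,536 entries.
    print(tokenize.cache_info().currsize)  # 65536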

8 files changed: +11 −8 lines

CHANGELOG.md (+3 −0)

@@ -1,5 +1,8 @@
 # Release Notes
 
+- 2.0.1 (2021-XX-XX)
+  - Handle potential memory usage issues due to LRU caching in tokenizers (#167)
+
 - 2.0.0 (2021-07-XX)
   - Build: Add Windows and OS X testing to Travis CI.
   - Improve documentation and type annotations.

sacrebleu/tokenizers/tokenizer_13a.py (+1 −1)

@@ -11,7 +11,7 @@ def signature(self):
     def __init__(self):
         self._post_tokenizer = TokenizerRegexp()
 
-    @lru_cache(maxsize=None)
+    @lru_cache(maxsize=2**16)
     def __call__(self, line):
         """Tokenizes an input line using a relatively minimal tokenization
         that is however equivalent to mteval-v13a, used by WMT.
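
Because __call__ is an instance method under lru_cache, repeated calls with the same line on the same tokenizer return the cached string instead of re-running the regular expressions. A hypothetical usage sketch (assuming the class in this module is Tokenizer13a):

    from sacrebleu.tokenizers.tokenizer_13a import Tokenizer13a

    tok = Tokenizer13a()
    tok('Hello, world!')  # cache miss: tokenized and stored
    tok('Hello, world!')  # cache hit: returned without re-tokenizing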

sacrebleu/tokenizers/tokenizer_char.py (+1 −1)

@@ -9,7 +9,7 @@ def signature(self):
     def __init__(self):
         pass
 
-    @lru_cache(maxsize=None)
+    @lru_cache(maxsize=2**16)
     def __call__(self, line):
         """Tokenizes all the characters in the input line.

sacrebleu/tokenizers/tokenizer_intl.py (+1 −1)

@@ -42,7 +42,7 @@ def __init__(self):
             (regex.compile(r'(\p{S})'), r' \1 '),
         ]
 
-    @lru_cache(maxsize=None)
+    @lru_cache(maxsize=2**16)
     def __call__(self, line: str) -> str:
         for (_re, repl) in self._re:
             line = _re.sub(repl, line)

sacrebleu/tokenizers/tokenizer_ja_mecab.py (+1 −1)

@@ -30,7 +30,7 @@ def __init__(self):
         # This asserts that no user dictionary has been loaded
         assert d.next is None
 
-    @lru_cache(maxsize=None)
+    @lru_cache(maxsize=2**16)
     def __call__(self, line):
         """
         Tokenizes an Japanese input line using MeCab morphological analyzer.

sacrebleu/tokenizers/tokenizer_re.py (+1 −1)

@@ -24,7 +24,7 @@ def __init__(self):
             # (re.compile(r'\s+'), r' '),
         ]
 
-    @lru_cache(maxsize=None)
+    @lru_cache(maxsize=2**16)
     def __call__(self, line):
         """Common post-processing tokenizer for `13a` and `zh` tokenizers.

sacrebleu/tokenizers/tokenizer_ter.py (+1 −1)

@@ -136,7 +136,7 @@ def __init__(self,
         self._asian_support = asian_support
         self._case_sensitive = case_sensitive
 
-    @lru_cache(maxsize=None)
+    @lru_cache(maxsize=2**16)
     # Although the cache is shared across different instances, same sentence
     # queries do not return invalid returns across different instances since
     # `self` becomes part of the query as well.
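
The comment above is worth unpacking: when lru_cache wraps a method, self is part of the cache key, so two tokenizers configured differently can share one cache without ever returning each other's results. A small illustrative sketch (the names are hypothetical, not from the codebase):

    from functools import lru_cache

    class Shout:
        def __init__(self, enabled):
            self.enabled = enabled

        @lru_cache(maxsize=2**16)
        def __call__(self, line):
            return line.upper() if self.enabled else line

    a, b = Shout(True), Shout(False)
    print(a('x'))  # 'X'
    print(b('x'))  # 'x' -- a distinct cache entry, keyed on (self, line)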

sacrebleu/tokenizers/tokenizer_zh.py (+2 −2)

@@ -78,7 +78,7 @@ def __init__(self):
         self._post_tokenizer = TokenizerRegexp()
 
     @staticmethod
-    @lru_cache(maxsize=None)
+    @lru_cache(maxsize=2**16)
     def _is_chinese_char(uchar):
         """
         :param uchar: input char in unicode

@@ -89,7 +89,7 @@ def _is_chinese_char(uchar):
             return True
         return False
 
-    @lru_cache(maxsize=None)
+    @lru_cache(maxsize=2**16)
     def __call__(self, line):
         """The tokenization of Chinese text in this script contains two
         steps: separate each Chinese characters (by utf-8 encoding); tokenize
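
Capping the _is_chinese_char cache is harmless in practice: its keys are single characters, so even a 2**16-entry cache can hold essentially every distinct code point a corpus will query. Note also the decorator order in the hunk above: lru_cache is applied first (innermost) because, before Python 3.10, a staticmethod object is not directly callable. An illustrative sketch of the pattern; the Unicode range below is an assumption, not the module's actual table:

    from functools import lru_cache

    class Sketch:
        @staticmethod
        @lru_cache(maxsize=2**16)  # keys are single chars, so 64K is plenty
        def _is_chinese_char(uchar):
            # Assumption: CJK Unified Ideographs only; the real tokenizer
            # checks a longer list of Unicode blocks.
            return '\u4e00' <= uchar <= '\u9fff'

    print(Sketch._is_chinese_char('\u4e2d'))  # True
    print(Sketch._is_chinese_char('a'))       # False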
