
Commit 2787185

Handle potential memory usage issue due to LRU caching (#167)
We now limit the number of items in each cache to 64K (2**16) so that memory usage is capped and does not grow without bound. This only matters for use cases where a metric is called repeatedly through the API, such as when computing rewards in reinforcement learning setups.
1 parent 078c440 commit 2787185
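
For illustration (not part of the commit), here is a minimal sketch of how a bounded functools.lru_cache caps memory: once maxsize entries exist, each new key evicts the least recently used one, so the cache never grows past the cap.

    from functools import lru_cache

    @lru_cache(maxsize=2**16)  # keep at most 65,536 results
    def tokenize(line: str) -> str:
        # Stand-in for real tokenization logic
        return ' '.join(line.split())

    for i in range(100_000):
        tokenize(f'sentence {i}')

    # Despite 100,000 distinct inputs, the cache holds only 65,536 entries.
    print(tokenize.cache_info().currsize)  # 65536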

8 files changed: +11 −8 lines

CHANGELOG.md (+3 −0)

@@ -1,5 +1,8 @@
 # Release Notes
 
+- 2.0.1 (2021-XX-XX)
+  - Handle potential memory usage issues due to LRU caching in tokenizers (#167)
+
 - 2.0.0 (2021-07-XX)
   - Build: Add Windows and OS X testing to Travis CI.
   - Improve documentation and type annotations.

sacrebleu/tokenizers/tokenizer_13a.py (+1 −1)

@@ -11,7 +11,7 @@ def signature(self):
     def __init__(self):
         self._post_tokenizer = TokenizerRegexp()
 
-    @lru_cache(maxsize=None)
+    @lru_cache(maxsize=2**16)
     def __call__(self, line):
         """Tokenizes an input line using a relatively minimal tokenization
         that is however equivalent to mteval-v13a, used by WMT.
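
Because __call__ is an instance method under lru_cache, repeated calls with the same line on the same tokenizer return the cached string instead of re-running the regular expressions. A hypothetical usage sketch (assuming the class in this module is Tokenizer13a):

    from sacrebleu.tokenizers.tokenizer_13a import Tokenizer13a

    tok = Tokenizer13a()
    tok('Hello, world!')  # cache miss: tokenized and stored
    tok('Hello, world!')  # cache hit: returned without re-tokenizing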

sacrebleu/tokenizers/tokenizer_char.py (+1 −1)

@@ -9,7 +9,7 @@ def signature(self):
     def __init__(self):
         pass
 
-    @lru_cache(maxsize=None)
+    @lru_cache(maxsize=2**16)
     def __call__(self, line):
         """Tokenizes all the characters in the input line.

sacrebleu/tokenizers/tokenizer_intl.py (+1 −1)

@@ -42,7 +42,7 @@ def __init__(self):
             (regex.compile(r'(\p{S})'), r' \1 '),
         ]
 
-    @lru_cache(maxsize=None)
+    @lru_cache(maxsize=2**16)
     def __call__(self, line: str) -> str:
         for (_re, repl) in self._re:
             line = _re.sub(repl, line)

sacrebleu/tokenizers/tokenizer_ja_mecab.py (+1 −1)

@@ -30,7 +30,7 @@ def __init__(self):
         # This asserts that no user dictionary has been loaded
         assert d.next is None
 
-    @lru_cache(maxsize=None)
+    @lru_cache(maxsize=2**16)
     def __call__(self, line):
         """
         Tokenizes an Japanese input line using MeCab morphological analyzer.

sacrebleu/tokenizers/tokenizer_re.py (+1 −1)

@@ -24,7 +24,7 @@ def __init__(self):
             # (re.compile(r'\s+'), r' '),
         ]
 
-    @lru_cache(maxsize=None)
+    @lru_cache(maxsize=2**16)
     def __call__(self, line):
         """Common post-processing tokenizer for `13a` and `zh` tokenizers.

sacrebleu/tokenizers/tokenizer_ter.py (+1 −1)

@@ -136,7 +136,7 @@ def __init__(self,
         self._asian_support = asian_support
         self._case_sensitive = case_sensitive
 
-    @lru_cache(maxsize=None)
+    @lru_cache(maxsize=2**16)
     # Although the cache is shared across different instances, same sentence
     # queries do not return invalid returns across different instances since
     # `self` becomes part of the query as well.
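
The comment above is worth unpacking: when lru_cache wraps a method, self is part of the cache key, so two tokenizers configured differently can share one cache without ever returning each other's results. A small illustrative sketch (the names are hypothetical, not from the codebase):

    from functools import lru_cache

    class Shout:
        def __init__(self, enabled):
            self.enabled = enabled

        @lru_cache(maxsize=2**16)
        def __call__(self, line):
            return line.upper() if self.enabled else line

    a, b = Shout(True), Shout(False)
    print(a('x'))  # 'X'
    print(b('x'))  # 'x' -- a distinct cache entry, keyed on (self, line)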

sacrebleu/tokenizers/tokenizer_zh.py (+2 −2)

@@ -78,7 +78,7 @@ def __init__(self):
         self._post_tokenizer = TokenizerRegexp()
 
     @staticmethod
-    @lru_cache(maxsize=None)
+    @lru_cache(maxsize=2**16)
     def _is_chinese_char(uchar):
         """
         :param uchar: input char in unicode

@@ -89,7 +89,7 @@ def _is_chinese_char(uchar):
             return True
         return False
 
-    @lru_cache(maxsize=None)
+    @lru_cache(maxsize=2**16)
     def __call__(self, line):
         """The tokenization of Chinese text in this script contains two
         steps: separate each Chinese characters (by utf-8 encoding); tokenize
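
Capping the _is_chinese_char cache is harmless in practice: its keys are single characters, so even a 2**16-entry cache can hold essentially every distinct code point a corpus will query. Note also the decorator order in the hunk above: lru_cache is applied first (innermost) because, before Python 3.10, a staticmethod object is not directly callable. An illustrative sketch of the pattern; the Unicode range below is an assumption, not the module's actual table:

    from functools import lru_cache

    class Sketch:
        @staticmethod
        @lru_cache(maxsize=2**16)  # keys are single chars, so 64K is plenty
        def _is_chinese_char(uchar):
            # Assumption: CJK Unified Ideographs only; the real tokenizer
            # checks a longer list of Unicode blocks.
            return '\u4e00' <= uchar <= '\u9fff'

    print(Sketch._is_chinese_char('\u4e2d'))  # True
    print(Sketch._is_chinese_char('a'))       # False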
