Skip to content

Commit

Permalink
Added WMT23 data source (closes #245) (#247)
Browse files Browse the repository at this point in the history
* Added WMT23 data source
* Version bump to 2.4.0
* Use refB by default for en-he and he-en
  • Loading branch information
mjpost authored Dec 12, 2023
1 parent b0ad2cb commit 72213e9
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 2 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Release Notes

- 2.4.0 (2023-12-11)
Added:
- WMT23 test sets (test set `wmt23`)

- 2.3.3 (2023-11-28)
Fixed:
- Typing issues (#249, #250)
Expand All @@ -15,7 +19,7 @@

- 2.3.1 (2022-10-18)
Bugfix:
- Set lru_cache to 2**16 for SPM tokenizer (was set to infinite)
- Set lru_cache to 2^16 for SPM tokenizer (was set to infinite)

- 2.3.0 (2022-10-18)
Features:
Expand Down
2 changes: 1 addition & 1 deletion sacrebleu/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

__version__ = '2.3.3'
__version__ = '2.4.0'
__description__ = 'Hassle-free computation of shareable, comparable, and reproducible BLEU, chrF, and TER scores'


Expand Down
29 changes: 29 additions & 0 deletions sacrebleu/dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,35 @@

DATASETS = {
# wmt
"wmt23": WMTXMLDataset(
"wmt23",
data=["https://github.com/wmt-conference/wmt23-news-systems/archive/refs/tags/v.0.1.tar.gz"],
description="Official evaluation and system data for WMT23.",
md5=["63576405e4ce07130a19ad76ba7eb75b"],
langpairs={
"cs-uk": ["wmt23-news-systems-v.0.1/xml/wmttest2023.cs-uk.all.xml"],
"de-en": ["wmt23-news-systems-v.0.1/xml/wmttest2023.de-en.all.xml"],
"en-cs": ["wmt23-news-systems-v.0.1/xml/wmttest2023.en-cs.all.xml"],
"en-de": ["wmt23-news-systems-v.0.1/xml/wmttest2023.en-de.all.xml"],
"en-he": {
"path": "wmt23-news-systems-v.0.1/xml/wmttest2023.en-he.all.xml",
"refs": ["B"],
},
"en-ja": ["wmt23-news-systems-v.0.1/xml/wmttest2023.en-ja.all.xml"],
"en-ru": ["wmt23-news-systems-v.0.1/xml/wmttest2023.en-ru.all.xml"],
"en-uk": ["wmt23-news-systems-v.0.1/xml/wmttest2023.en-uk.all.xml"],
"en-zh": ["wmt23-news-systems-v.0.1/xml/wmttest2023.en-zh.all.xml"],
"he-en": {
"path": "wmt23-news-systems-v.0.1/xml/wmttest2023.he-en.all.xml",
"refs": ["B"],
},
"ja-en": ["wmt23-news-systems-v.0.1/xml/wmttest2023.ja-en.all.xml"],
"ru-en": ["wmt23-news-systems-v.0.1/xml/wmttest2023.ru-en.all.xml"],
"uk-en": ["wmt23-news-systems-v.0.1/xml/wmttest2023.uk-en.all.xml"],
"zh-en": ["wmt23-news-systems-v.0.1/xml/wmttest2023.zh-en.all.xml"],
},
refs=["A"],
),
"wmt22": WMTXMLDataset(
"wmt22",
data=["https://github.com/wmt-conference/wmt22-news-systems/archive/refs/tags/v1.1.tar.gz"],
Expand Down

0 comments on commit 72213e9

Please sign in to comment.