Skip to content

Commit aa76878

Browse files
authored
customer issue fix (#218)
1 parent bab5ff0 commit aa76878

File tree

2 files changed

+27
-6
lines changed

2 files changed

+27
-6
lines changed

nemo/retriever-synthetic-data-generation/nemo_retriever_sdg/dataset.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,15 +72,26 @@ def load_rawdoc(cls, name: str):
7272
"""Load rawdoc format
7373
{"text": "...", "title": "..."}
7474
{"text": "...", "title": "..."}
75+
or
76+
{"_id": "...", "text": "...", "title": "..."}
77+
{"_id": "...", "text": "...", "title": "..."}
7578
...
7679
"""
7780
examples = []
7881
for example in [json.loads(line) for line in open(name, "r")]:
79-
examples.append({"paragraphs": [
80-
{"context": example["text"],
81-
"document_id": example["title"],
82-
"qas": []}],
83-
})
82+
if "_id" in example:
83+
examples.append({"paragraphs": [
84+
{"context": example["text"],
85+
"document_id": example["_id"],
86+
"title": example["title"],
87+
"qas": []}],
88+
})
89+
else:
90+
examples.append({"paragraphs": [
91+
{"context": example["text"],
92+
"title": example["title"],
93+
"qas": []}],
94+
})
8495
return cls({"data": examples, "version": "2.0"})
8596

8697
def to_json(self, output_path: str):
@@ -174,7 +185,10 @@ def to_beir(self,
174185
qrels = []
175186
qid = 0
176187
for i, example in enumerate(self.data["data"]):
177-
doc_id = "doc{}".format(i + 1) # "doc1", "doc2", "doc3", ... "docN"
188+
if "document_id" in example['paragraphs'][0]:
189+
doc_id = example['paragraphs'][0]['document_id'] #doc_id
190+
else:
191+
doc_id = "doc{}".format(i + 1) # "doc1", "doc2", "doc3", ... "docN"
178192
title = example["paragraphs"][0]["document_id"] # Title
179193
text = example["paragraphs"][0]["context"]
180194
# TODO: Add information as metadata

nemo/retriever-synthetic-data-generation/notebooks/quickstart.ipynb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,13 @@
3232
"{\"text\": \"The Eiffel Tower is an iron lattice tower on the Champ de Mars in Paris.\", \"title\": \"Iconic Landmark\" }\n",
3333
"...\n",
3434
"```\n",
35+
"If a document already has an ID, the pipeline also accepts input in the format `{\"_id\": <document id>, \"text\": <document>, \"title\": <title>}`. The same document ID is preserved in the generated results.\n",
36+
"\n",
37+
"```\n",
38+
"{\"_id\": \"3\", \"text\": \"The quick brown fox jumps over the lazy dog.\", \"title\": \"Classic Pangram\" }\n",
39+
"{\"_id\": \"43\", \"text\": \"The Eiffel Tower is an iron lattice tower on the Champ de Mars in Paris.\", \"title\": \"Iconic Landmark\" }\n",
40+
"...\n",
41+
"```\n",
3542
"\n",
3643
"This repository contains a sample JSONL file `data/sample_data.jsonl`.\n",
3744
"\n",

0 commit comments

Comments
 (0)