diff --git a/nemo/retriever-synthetic-data-generation/nemo_retriever_sdg/dataset.py b/nemo/retriever-synthetic-data-generation/nemo_retriever_sdg/dataset.py index a331d3d4..5234a62e 100644 --- a/nemo/retriever-synthetic-data-generation/nemo_retriever_sdg/dataset.py +++ b/nemo/retriever-synthetic-data-generation/nemo_retriever_sdg/dataset.py @@ -72,15 +72,26 @@ def load_rawdoc(cls, name: str): """Load rawdoc format {"text": "...", "title": "..."} {"text": "...", "title": "..."} + or + {"_id": "...", "text": "...", "title": "..."} + {"_id": "...", "text": "...", "title": "..."} ... """ examples = [] for example in [json.loads(line) for line in open(name, "r")]: - examples.append({"paragraphs": [ - {"context": example["text"], - "document_id": example["title"], - "qas": []}], - }) + if "_id" in example: + examples.append({"paragraphs": [ + {"context": example["text"], + "document_id": example["_id"], + "title": example["title"], + "qas": []}], + }) + else: + examples.append({"paragraphs": [ + {"context": example["text"], + "title": example["title"], + "qas": []}], + }) return cls({"data": examples, "version": "2.0"}) def to_json(self, output_path: str): @@ -174,7 +185,10 @@ def to_beir(self, qrels = [] qid = 0 for i, example in enumerate(self.data["data"]): - doc_id = "doc{}".format(i + 1) # "doc1", "doc2", "doc3", ... "docN" + if "document_id" in example['paragraphs'][0]: + doc_id = example['paragraphs'][0]['document_id'] #doc_id + else: + doc_id = "doc{}".format(i + 1) # "doc1", "doc2", "doc3", ... 
"docN" title = example["paragraphs"][0]["document_id"] # Title text = example["paragraphs"][0]["context"] # TODO: Add information as metadata diff --git a/nemo/retriever-synthetic-data-generation/notebooks/quickstart.ipynb b/nemo/retriever-synthetic-data-generation/notebooks/quickstart.ipynb index c6007a10..2d0cff8e 100644 --- a/nemo/retriever-synthetic-data-generation/notebooks/quickstart.ipynb +++ b/nemo/retriever-synthetic-data-generation/notebooks/quickstart.ipynb @@ -32,6 +32,13 @@ "{\"text\": \"The Eiffel Tower is an iron lattice tower on the Champ de Mars in Paris.\", \"title\": \"Iconic Landmark\" }\n", "...\n", "```\n", + "If your documents already have ids, the pipeline also accepts input in the format `{\"_id\": , \"text\": , \"title\": }`. The same document id is preserved in the generated results.\n", + "\n", + "```\n", + "{\"_id\": \"3\", \"text\": \"The quick brown fox jumps over the lazy dog.\", \"title\": \"Classic Pangram\" }\n", + "{\"_id\": \"43\", \"text\": \"The Eiffel Tower is an iron lattice tower on the Champ de Mars in Paris.\", \"title\": \"Iconic Landmark\" }\n", + "...\n", + "```\n", "\n", "This repository contains a sample JSONL file `data/sample_data.jsonl`.\n", "\n",