Skip to content

Commit

Permalink
customer issue fix (#218)
Browse files Browse the repository at this point in the history
  • Loading branch information
vinay-raman authored Oct 15, 2024
1 parent bab5ff0 commit aa76878
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,26 @@ def load_rawdoc(cls, name: str):
"""Load rawdoc format
{"text": "...", "title": "..."}
{"text": "...", "title": "..."}
or
{"_id": "...", "text": "...", "title": "..."}
{"_id": "...", "text": "...", "title": "..."}
...
"""
examples = []
for example in [json.loads(line) for line in open(name, "r")]:
examples.append({"paragraphs": [
{"context": example["text"],
"document_id": example["title"],
"qas": []}],
})
if "_id" in example:
examples.append({"paragraphs": [
{"context": example["text"],
"document_id": example["_id"],
"title": example["title"],
"qas": []}],
})
else:
examples.append({"paragraphs": [
{"context": example["text"],
"title": example["title"],
"qas": []}],
})
return cls({"data": examples, "version": "2.0"})

def to_json(self, output_path: str):
Expand Down Expand Up @@ -174,7 +185,10 @@ def to_beir(self,
qrels = []
qid = 0
for i, example in enumerate(self.data["data"]):
doc_id = "doc{}".format(i + 1) # "doc1", "doc2", "doc3", ... "docN"
if "document_id" in example['paragraphs'][0]:
doc_id = example['paragraphs'][0]['document_id'] #doc_id
else:
doc_id = "doc{}".format(i + 1) # "doc1", "doc2", "doc3", ... "docN"
title = example["paragraphs"][0]["document_id"] # Title
text = example["paragraphs"][0]["context"]
# TODO: Add information as metadata
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@
"{\"text\": \"The Eiffel Tower is an iron lattice tower on the Champ de Mars in Paris.\", \"title\": \"Iconic Landmark\" }\n",
"...\n",
"```\n",
"If document already has id, the pipeline also accepts in the format of `{\"_id\": <document id>, \"text\": <document>, \"title\": <title>}`. The same document id would be persisted in the generated results.\n",
"\n",
"```\n",
"{\"_id\": \"3\", \"text\": \"The quick brown fox jumps over the lazy dog.\", \"title\": \"Classic Pangram\" }\n",
"{\"_id\": \"43\", \"text\": \"The Eiffel Tower is an iron lattice tower on the Champ de Mars in Paris.\", \"title\": \"Iconic Landmark\" }\n",
"...\n",
"```\n",
"\n",
"This repository contains a sample JSONL file `data/sample_data.jsonl`.\n",
"\n",
Expand Down

0 comments on commit aa76878

Please sign in to comment.