neulab
diff --git a/‎.gitignore
+2-1 b/‎.gitignore
+2-1
diff --git a/‎README.md
+16-1 b/‎README.md
+16-1
diff --git a/‎examples/create_transform_data_example.py
+5-4 b/‎examples/create_transform_data_example.py
+5-4
diff --git a/‎examples/huggingface_data/huggingface_datasets/dataset_index.json
-1 b/‎examples/huggingface_data/huggingface_datasets/dataset_index.json
-1
diff --git a/‎examples/huggingface_data/huggingface_datasets/reranking_dataset_index.json
-1 b/‎examples/huggingface_data/huggingface_datasets/reranking_dataset_index.json
-1
diff --git a/‎huggingface_data/huggingface_datasets/huggingface_datasets_datafinder_index
-7.31 MB b/‎huggingface_data/huggingface_datasets/huggingface_datasets_datafinder_index
-7.31 MB
@@ -22,8 +22,9 @@ huggingface_data/huggingface_datasets/huggingface_datasets_datafinder_index
 huggingface_data/huggingface_datasets/reranking_dataset_index.json
 huggingface_data/huggingface_models/
 retrieved_dataset_dict/
+result/
+checkpoint/
 status.yaml
-
 # Outputs generated by the colab demo
 trained_model/
 trained_tokenizer/
@@ -95,7 +95,9 @@ If you're interested in contributing to the `prompt2model` project, please
 
 We have [written a paper describing Prompt2Model in detail](https://arxiv.org/abs/2308.12261).
 
-If you use Prompt2Model in your research, please cite our paper:
+If you use Prompt2Model in your research, please cite us!
+
+If you discuss or use the overall Prompt2Model framework, please reference our paper:
 
 ```bibtex
 @misc{prompt2model,
@@ -107,3 +109,16 @@ If you use Prompt2Model in your research, please cite our paper:
       primaryClass={cs.CL}
 }
 ```
+
+If you discuss or use our dataset retrieval and transformation tools, please reference this paper:
+
+```bibtex
+@misc{prompt2modeldatatune,
+      title={Better Synthetic Data by Retrieving and Transforming Existing Datasets}, 
+      author={Saumya Gandhi and Ritu Gala and Vijay Viswanathan and Tongshuang Wu and Graham Neubig},
+      year={2024},
+      eprint={2404.14361},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+```
@@ -33,12 +33,13 @@
 
     # run this pipeline to retrieve relevant datasets, rerank them,
     # and transform them based on the prompt
-    retriever = DescriptionDatasetRetriever()
-    num_points_to_transform = 20
+    total_num_points_to_transform = 20
+    retriever = DescriptionDatasetRetriever(
+        auto_transform_data=True,
+        total_num_points_to_transform=total_num_points_to_transform,
+    )
     retrieved_dataset_dict = retriever.retrieve_dataset_dict(
         prompt_spec,
-        auto_transform_data=True,
-        num_points_to_transform=num_points_to_transform,
     )
 
     # save the final dataset to disk