diff --git a/environments/careqa/README.md b/environments/careqa/README.md index 65d593e1..abc583b7 100644 --- a/environments/careqa/README.md +++ b/environments/careqa/README.md @@ -87,3 +87,34 @@ env_mcq = vf.load_environment("careqa", mode="en", shuffle_answers=True) # Load open-ended environment env_open = vf.load_environment("careqa", mode="open", judge_model="gpt-4o-mini") ``` + +### References + +```bibtex +@inproceedings{arias-duart-etal-2025-automatic, + title = "Automatic Evaluation of Healthcare {LLM}s Beyond Question-Answering", + author = "Arias-Duart, Anna and + Martin-Torres, Pablo Agustin and + Hinjos, Daniel and + Bernabeu-Perez, Pablo and + Ganzabal, Lucia Urcelay and + Mallo, Marta Gonzalez and + Gururajan, Ashwin Kumar and + Lopez-Cuena, Enrique and + Alvarez-Napagao, Sergio and + Garcia-Gasulla, Dario", + editor = "Chiruzzo, Luis and + Ritter, Alan and + Wang, Lu", + booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)", + month = apr, + year = "2025", + address = "Albuquerque, New Mexico", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2025.naacl-short.10/", + pages = "108--130", + ISBN = "979-8-89176-190-2", +} + +``` + diff --git a/environments/healthbench/README.md b/environments/healthbench/README.md index 0a5ab059..37129d62 100644 --- a/environments/healthbench/README.md +++ b/environments/healthbench/README.md @@ -177,3 +177,18 @@ This allows you to see exactly which criteria the model passed or failed, along - Arrays in `info` (criteria, points_list, axes, consensus_criteria) are all aligned by index - the first element of each corresponds to the first rubric criterion - Point values can be negative for undesirable behaviors (e.g., -2 points for "Gives dangerous medical advice") - The total score is normalized to 0-1 regardless of the actual point scale 
used + + +### References + +```bibtex +@misc{arora2025healthbenchevaluatinglargelanguage, + title={HealthBench: Evaluating Large Language Models Towards Improved Human Health}, + author={Rahul K. Arora and Jason Wei and Rebecca Soskin Hicks and Preston Bowman and Joaquin Quiñonero-Candela and Foivos Tsimpourlas and Michael Sharman and Meghan Shah and Andrea Vallone and Alex Beutel and Johannes Heidecke and Karan Singhal}, + year={2025}, + eprint={2505.08775}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2505.08775}, +} +``` \ No newline at end of file diff --git a/environments/longhealth/README.md b/environments/longhealth/README.md index 2c8fc032..caa79980 100644 --- a/environments/longhealth/README.md +++ b/environments/longhealth/README.md @@ -99,4 +99,15 @@ results = await env.evaluate(client, "gpt-4.1-mini", num_examples=10) ### Authors This environment has been put together by: -Shamus Sim Zi Yang - ([@ss8319](https://github.com/ss8319)) \ No newline at end of file +Shamus Sim Zi Yang - ([@ss8319](https://github.com/ss8319)) + +### References + +```bibtex +@article{adams2024longhealth, + title={LongHealth: A Question Answering Benchmark with Long Clinical Documents}, + author={Adams, Lisa and Busch, Felix and Han, Tianyu and Excoffier, Jean-Baptiste and Ortala, Matthieu and L{\"o}ser, Alexander and Aerts, Hugo JWL and Kather, Jakob Nikolas and Truhn, Daniel and Bressem, Keno}, + journal={arXiv preprint arXiv:2401.14490}, + year={2024} +} +``` \ No newline at end of file diff --git a/environments/m_arc/README.md b/environments/m_arc/README.md index cea7b05d..9e3ef8cc 100644 --- a/environments/m_arc/README.md +++ b/environments/m_arc/README.md @@ -65,3 +65,17 @@ Notes: | `correct_answer_reward_func` | (weight 1.0): 1.0 if parsed letter is correct, else 0.0| + +### References + +```bibtex +@misc{kim2025limitationslargelanguagemodels, + title={Limitations of Large Language Models in Clinical Problem-Solving Arising from Inflexible 
Reasoning}, + author={Jonathan Kim and Anna Podlasek and Kie Shidara and Feng Liu and Ahmed Alaa and Danilo Bernardo}, + year={2025}, + eprint={2502.04381}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2502.04381}, +} +``` \ No newline at end of file diff --git a/environments/medagentbench/README.md b/environments/medagentbench/README.md index 9038dd53..11962104 100644 --- a/environments/medagentbench/README.md +++ b/environments/medagentbench/README.md @@ -70,3 +70,17 @@ Notes: ### Note This environment is adapted from the original Prime Intellect [MedAgentBench implementation](https://app.primeintellect.ai/dashboard/environments/primeintellect/med-agent-bench). It has been modified to report the query success rate and action success rate as unweighted rewards to match the paper. + + +### References + +```bibtex +@article{jiang2025medagentbench, + title={MedAgentBench: A Virtual EHR Environment to Benchmark Medical LLM Agents}, + author={Jiang, Yixing and Black, Kameron C and Geng, Gloria and Park, Danny and Zou, James and Ng, Andrew Y and Chen, Jonathan H}, + journal={NEJM AI}, + pages={AIdbp2500144}, + year={2025}, + publisher={Massachusetts Medical Society} +} +``` \ No newline at end of file diff --git a/environments/medbullets/README.md b/environments/medbullets/README.md index e848205b..f3411b32 100644 --- a/environments/medbullets/README.md +++ b/environments/medbullets/README.md @@ -60,3 +60,27 @@ Summarize key metrics your rubric emits and how they’re interpreted. 
| ------ | ------- | | `correct_answer_reward_func` | (weight 1.0): 1.0 if parsed letter is correct, else 0.0| | `parser.get_format_reward_func()` | (weight 0.0): optional format adherence (not counted) | + +### References + +```bibtex +@inproceedings{chen-etal-2025-benchmarking, + title = "Benchmarking Large Language Models on Answering and Explaining Challenging Medical Questions", + author = "Chen, Hanjie and + Fang, Zhouxiang and + Singla, Yash and + Dredze, Mark", + editor = "Chiruzzo, Luis and + Ritter, Alan and + Wang, Lu", + booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)", + month = apr, + year = "2025", + address = "Albuquerque, New Mexico", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2025.naacl-long.182/", + doi = "10.18653/v1/2025.naacl-long.182", + pages = "3563--3599", + ISBN = "979-8-89176-189-6", +} +``` \ No newline at end of file diff --git a/environments/medcalc_bench/README.md b/environments/medcalc_bench/README.md index 609cf4d8..1444567a 100644 --- a/environments/medcalc_bench/README.md +++ b/environments/medcalc_bench/README.md @@ -62,3 +62,17 @@ Notes: ### Adjustments Adjusted the prompt to output the step-by-step thinking and final answer with the and tags instead of responding with a JSON. 
+ + +### References + +```bibtex +@misc{khandekar2024medcalcbench, + title={MedCalc-Bench: Evaluating Large Language Models for Medical Calculations}, + author={Nikhil Khandekar and Qiao Jin and Guangzhi Xiong and Soren Dunn and Serina S Applebaum and Zain Anwar and Maame Sarfo-Gyamfi and Conrad W Safranek and Abid A Anwar and Andrew Zhang and Aidan Gilson and Maxwell B Singer and Amisha Dave and Andrew Taylor and Aidong Zhang and Qingyu Chen and Zhiyong Lu}, + year={2024}, + eprint={2406.12036}, + archivePrefix={arXiv}, + primaryClass={cs.CL}
+} +``` \ No newline at end of file diff --git a/environments/medcasereasoning/README.md index 7d797ee1..41c8e395 100644 --- a/environments/medcasereasoning/README.md +++ b/environments/medcasereasoning/README.md @@ -28,3 +28,14 @@ Configure model and sampling: uv run vf-eval medcasereasoning -m gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7 ``` +### References + +```bibtex +@inproceedings{wu2025medcase, + title = {MedCaseReasoning: Evaluating and Learning Diagnostic Reasoning from Clinical Case Reports}, + author = {Wu, Kevin and Wu, Eric and Thapa, Rahul and others}, + booktitle = {NeurIPS}, + year = {2025}, + url = {https://github.com/kevinwu23/MedCaseReasoning} +} +``` \ No newline at end of file diff --git a/environments/medconceptsqa/README.md index ad4edd8c..27298b8d 100644 --- a/environments/medconceptsqa/README.md +++ b/environments/medconceptsqa/README.md @@ -43,4 +43,21 @@ Summarize key metrics your rubric emits and how they’re interpreted. ### Authors This environment has been put together by: -Anish Mahishi - ([@macandro96](https://github.com/macandro96)) \ No newline at end of file +Anish Mahishi - ([@macandro96](https://github.com/macandro96)) + + + + +### References + +```bibtex +@misc{shoham2024medconceptsqaopensourcemedical, + title={MedConceptsQA: Open Source Medical Concepts QA Benchmark}, + author={Ofir Ben Shoham and Nadav Rappoport}, + year={2024}, + eprint={2405.07348}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2405.07348}, +} +``` \ No newline at end of file diff --git a/environments/medec/README.md index 39c930f3..5a1b8474 100644 --- a/environments/medec/README.md +++ b/environments/medec/README.md @@ -115,4 +115,18 @@ The **primary reward score** is the weighted sum of the first three metrics. | `rouge_reward` | 0 | ROUGE-1 F1 score (for analysis only). 
| | `bertscore_reward` | 0 | BERTScore F1 (for analysis only). | | `bleurt_reward` | 0 | BLEURT score (for analysis only). | -| `reward` | N/A | Final weighted sum of non-zero weight metrics (0.0–1.0). | \ No newline at end of file +| `reward` | N/A | Final weighted sum of non-zero weight metrics (0.0–1.0). | + + +### References + +```bibtex + @article{medec, + author = {Asma {Ben Abacha} and Wen-wai Yim and Yujuan Fu and Zhaoyi Sun and Meliha Yetisgen and Fei Xia and Thomas Lin}, + title = {MEDEC: A Benchmark for Medical Error Detection and Correction in Clinical Notes}, + journal = {CoRR}, + eprinttype = {arXiv}, + url = {https://arxiv.org/pdf/2412.19260}, + year = {2024} + } +``` \ No newline at end of file diff --git a/environments/medredqa/README.md b/environments/medredqa/README.md index d103b747..db236238 100644 --- a/environments/medredqa/README.md +++ b/environments/medredqa/README.md @@ -99,4 +99,34 @@ uv run vf-eval medredqa \ ### Authors This environment has been put together by: -Kunal Bagga - ([@bagga005](https://github.com/bagga005)) \ No newline at end of file +Kunal Bagga - ([@bagga005](https://github.com/bagga005)) + + + +### References + +```bibtex +@inproceedings{nguyen-etal-2023-medredqa, + title = "{M}ed{R}ed{QA} for Medical Consumer Question Answering: Dataset, Tasks, and Neural Baselines", + author = "Nguyen, Vincent and + Karimi, Sarvnaz and + Rybinski, Maciej and + Xing, Zhenchang", + editor = "Park, Jong C. 
and + Arase, Yuki and + Hu, Baotian and + Lu, Wei and + Wijaya, Derry and + Purwarianti, Ayu and + Krisnadhi, Adila Alfa", + booktitle = "Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)", + month = nov, + year = "2023", + address = "Nusa Dua, Bali", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2023.ijcnlp-main.42/", + doi = "10.18653/v1/2023.ijcnlp-main.42", + pages = "629--648" +} + +``` \ No newline at end of file diff --git a/environments/medxpertqa/README.md b/environments/medxpertqa/README.md index 2b9868fa..8c438c7c 100644 --- a/environments/medxpertqa/README.md +++ b/environments/medxpertqa/README.md @@ -48,3 +48,14 @@ Summarize key metrics your rubric emits and how they’re interpreted. | `reward` | Main scalar reward (weighted sum of criteria) | | `accuracy` | Exact match on target answer | + +### References + +```bibtex +@article{zuo2025medxpertqa, + title={MedXpertQA: Benchmarking Expert-Level Medical Reasoning and Understanding}, + author={Zuo, Yuxin and Qu, Shang and Li, Yifei and Chen, Zhangren and Zhu, Xuekai and Hua, Ermo and Zhang, Kaiyan and Ding, Ning and Zhou, Bowen}, + journal={arXiv preprint arXiv:2501.18362}, + year={2025} +} +``` \ No newline at end of file diff --git a/environments/metamedqa/README.md b/environments/metamedqa/README.md index e5c9675a..cdae4ea5 100644 --- a/environments/metamedqa/README.md +++ b/environments/metamedqa/README.md @@ -29,3 +29,23 @@ The evaluation environment is defined in `metamedqa.py` and uses the HuggingFace This environment has been put together by: Aymane Ouraq - ([@aymaneo](https://github.com/aymaneo)) + + +### References + +```bibtex +@article{griot_large_2025, + title = {Large {Language} {Models} lack essential metacognition for reliable medical reasoning}, + volume = {16}, + issn = {2041-1723}, + 
url = {https://doi.org/10.1038/s41467-024-55628-6}, + doi = {10.1038/s41467-024-55628-6}, + number = {1}, + journal = {Nature Communications}, + author = {Griot, Maxime and Hemptinne, Coralie and Vanderdonckt, Jean and Yuksel, Demet}, + month = jan, + year = {2025}, + pages = {642}, +} + +``` \ No newline at end of file diff --git a/environments/mmlu_pro_health/README.md index 0d94a017..1bfbf780 100644 --- a/environments/mmlu_pro_health/README.md +++ b/environments/mmlu_pro_health/README.md @@ -59,3 +59,13 @@ Notes: | `accuracy` | (weight 1.0): 1.0 if parsed letter is correct, else 0.0 | +### References + +```bibtex +@article{wang2024mmlu, + title={{MMLU-Pro}: A More Robust and Challenging Multi-Task Language Understanding Benchmark}, + author={Wang, Yubo and Ma, Xueguang and Zhang, Ge and Ni, Yuansheng and Chandra, Abhranil and Guo, Shiguang and Ren, Weiming and Arulraj, Aaran and He, Xuan and Jiang, Ziyan and others}, + journal={arXiv preprint arXiv:2406.01574}, + year={2024} +} +```