reference.bib

% ----------------SURVEY PAPERS---------------------
@article{yang-etal-2024-diffusion,
  author  = {{Ling Yang, Zhilong Zhang, Yang Song, Shenda Hong, Runsheng Xu, Yue Zhao, Wentao Zhang, Bin Cui, and Ming{-}Hsuan Yang}},
  title   = {{Diffusion Models: {A} Comprehensive Survey of Methods and Applications}},
  journal = {{ACM} Comput. Surv.},
  volume  = {56},
  number  = {4},
  pages   = {105:1--105:39},
  year    = {2024},
}

@article{croitoru-etal-2023-diffusion,
  author  = {{Florinel{-}Alin Croitoru, Vlad Hondru, Radu Tudor Ionescu, and Mubarak Shah}},
  title   = {{Diffusion Models in Vision: {A} Survey}},
  journal = {TPAMI},
  volume  = {45},
  number  = {9},
  pages   = {10850--10869},
  year    = {2023},
}

@article{zhang-etal-2023-texttoimage,
  author  = {{Chenshuang Zhang, Chaoning Zhang, Mengchun Zhang, and In So Kweon}},
  title   = {{Text-to-image Diffusion Models in Generative {AI:} {A} Survey}},
  journal = {CoRR},
  volume  = {abs/2303.07909},
  year    = {2023},
}

% Non-ASCII letters are written as LaTeX escapes, as in the other entries of this file.
@misc{po-etal-2023-state,
      title={{State of the Art on Diffusion Models for Visual Computing}},
      author={{Ryan Po, Wang Yifan, Vladislav Golyanik, Kfir Aberman, Jonathan T. Barron, Amit H. Bermano, Eric Ryan Chan, Tali Dekel, Aleksander Holynski, Angjoo Kanazawa, C. Karen Liu, Lingjie Liu, Ben Mildenhall, Matthias Nie{\ss}ner, Bj{\"{o}}rn Ommer, Christian Theobalt, Peter Wonka, and Gordon Wetzstein}},
      year={2023},
      eprint={2310.07204},
      archivePrefix={arXiv},
      primaryClass={cs.AI}
}

@article{ulhaq-etal-2022-effiicent,
  author  = {{Anwaar Ulhaq, Naveed Akhtar, and Ganna Pogrebna}},
  title   = {{Efficient Diffusion Models for Vision: {A} Survey}},
  journal = {CoRR},
  volume  = {abs/2210.09292},
  year    = {2022},
}

@misc{cao-etal-2024-controllable,
  author        = {{Pu Cao, Feng Zhou, Qing Song, and Lu Yang}},
  title         = {{Controllable Generation with Text-to-Image Diffusion Models: A Survey}},
  year          = {2024},
  eprint        = {2403.04279},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This is a placeholder for PromptCharm. The BibTeX reference will be updated as soon as the metadata is available.


% ----------------TEXT-TO-IMAGE GENERATION---------------------
@inproceedings{baraheem-etal-2020-text,
  author    = {{Samah Saeed Baraheem, Trung{-}Nghia Le, and Tam V. Nguyen}},
  title     = {{Text-to-Image Synthesis via Aesthetic Layout}},
  booktitle = {ACM MM},
  pages     = {4485--4487},
  year      = {2020},
}

@inproceedings{ding-etal-2021-cogview,
  author    = {{Ming Ding, Zhuoyi Yang, Wenyi Hong, Wendi Zheng, Chang Zhou, Da Yin, Junyang Lin, Xu Zou, Zhou Shao, Hongxia Yang, and Jie Tang}},
  title     = {{CogView: Mastering Text-to-Image Generation via Transformers}},
  booktitle = {NeurIPS},
  pages     = {19822--19835},
  year      = {2021},
}

@inproceedings{ramesh-etal-2021-dalle1,
  author    = {{Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever}},
  title     = {{Zero-Shot Text-to-Image Generation}},
  booktitle = {ICML},
  series    = {Proceedings of Machine Learning Research},
  volume    = {139},
  pages     = {8821--8831},
  year      = {2021},
}

@inproceedings{ruan-etal-2021-daegan,
  author    = {{Shulan Ruan, Yong Zhang, Kun Zhang, Yanbo Fan, Fan Tang, Qi Liu, and Enhong Chen}},
  title     = {{{DAE-GAN:} Dynamic Aspect-aware {GAN} for Text-to-Image Synthesis}},
  booktitle = {ICCV},
  pages     = {13940--13949},
  year      = {2021},
}

@inproceedings{wang-etal-2021-cycle,
  author    = {{Hao Wang, Guosheng Lin, Steven C. H. Hoi, and Chunyan Miao}},
  title     = {{Cycle-Consistent Inverse {GAN} for Text-to-Image Synthesis}},
  booktitle = {ACM MM},
  pages     = {630--638},
  year      = {2021},
}

@inproceedings{qiao-etal-2021-rgan,
  author    = {{Yanyuan Qiao, Qi Chen, Chaorui Deng, Ning Ding, Yuankai Qi, Mingkui Tan, Xincheng Ren, and Qi Wu}},
  title     = {{{R-GAN:} Exploring Human-like Way for Reasonable Text-to-Image Synthesis via Generative Adversarial Networks}},
  booktitle = {ACM MM},
  pages     = {2085--2093},
  year      = {2021},
}

@inproceedings{zhang-etal-2021-ufcbert,
  author    = {{Zhu Zhang, Jianxin Ma, Chang Zhou, Rui Men, Zhikang Li, Ming Ding, Jie Tang, Jingren Zhou, and Hongxia Yang}},
  title     = {{{UFC-BERT:} Unifying Multi-Modal Controls for Conditional Image Synthesis}},
  booktitle = {NeurIPS},
  pages     = {27196--27208},
  year      = {2021},
}

@inproceedings{rombach-etal-2022-stable-diffusion,
  author    = {{Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, and Bj{\"{o}}rn Ommer}},
  title     = {{High-Resolution Image Synthesis with Latent Diffusion Models}},
  booktitle = {CVPR},
  pages     = {10674--10685},
  year      = {2022},
}

@inproceedings{gu-etal-2022-vector,
  author    = {{Shuyang Gu, Dong Chen, Jianmin Bao, Fang Wen, Bo Zhang, Dongdong Chen, Lu Yuan, and Baining Guo}},
  title     = {{Vector Quantized Diffusion Model for Text-to-Image Synthesis}},
  booktitle = {CVPR},
  pages     = {10686--10696},
  year      = {2022},
}

@inproceedings{tao-etal-2022-dfgan,
  author    = {{Ming Tao, Hao Tang, Fei Wu, Xiaoyuan Jing, Bing{-}Kun Bao, and Changsheng Xu}},
  title     = {{{DF-GAN:} {A} Simple and Effective Baseline for Text-to-Image Synthesis}},
  booktitle = {CVPR},
  pages     = {16494--16504},
  year      = {2022},
}

@inproceedings{zhou-etal-2022-lafite,
  author    = {{Yufan Zhou, Ruiyi Zhang, Changyou Chen, Chunyuan Li, Chris Tensmeyer, Tong Yu, Jiuxiang Gu, Jinhui Xu, and Tong Sun}},
  title     = {{Towards Language-Free Training for Text-to-Image Generation}},
  booktitle = {CVPR},
  pages     = {17886--17896},
  year      = {2022},
}

@inproceedings{wu-etal-2022-text,
  author    = {{Fuxiang Wu, Liu Liu, Fusheng Hao, Fengxiang He, and Jun Cheng}},
  title     = {{Text-to-Image Synthesis based on Object-Guided Joint-Decoding Transformer}},
  booktitle = {CVPR},
  pages     = {18092--18101},
  year      = {2022},
}

@inproceedings{li-etal-2022-stylet2i,
  author    = {{Zhiheng Li, Martin Renqiang Min, Kai Li, and Chenliang Xu}},
  title     = {{StyleT2I: Toward Compositional and High-Fidelity Text-to-Image Synthesis}},
  booktitle = {CVPR},
  pages     = {18176--18186},
  year      = {2022},
}

@inproceedings{kim-etal-2022-diffusionclip,
  author    = {{Gwanghyun Kim, Taesung Kwon, and Jong Chul Ye}},
  title     = {{DiffusionCLIP: Text-Guided Diffusion Models for Robust Image Manipulation}},
  booktitle = {CVPR},
  pages     = {2416--2425},
  year      = {2022},
}

@inproceedings{ding-etal-2022-cogview2,
  author    = {{Ming Ding, Wendi Zheng, Wenyi Hong, and Jie Tang}},
  title     = {{CogView2: Faster and Better Text-to-Image Generation via Hierarchical Transformers}},
  booktitle = {NeurIPS},
  year      = {2022},
}

@inproceedings{saharia-etal-2022-imagen,
  author    = {{Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L. Denton, Seyed Kamyar Seyed Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, Jonathan Ho, David J. Fleet, and Mohammad Norouzi}},
  title     = {{Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding}},
  booktitle = {NeurIPS},
  year      = {2022},
}

@article{ramesh-etal-2022-dalle2,
  author  = {{Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen}},
  title   = {{Hierarchical Text-Conditional Image Generation with {CLIP} Latents}},
  journal = {CoRR},
  volume  = {abs/2204.06125},
  year    = {2022},
}

@inproceedings{lee-etal-2022-autoregressive,
  author    = {{Doyup Lee, Chiheon Kim, Saehoon Kim, Minsu Cho, and Wook{-}Shin Han}},
  title     = {{Autoregressive Image Generation using Residual Quantization}},
  booktitle = {CVPR},
  pages     = {11513--11522},
  year      = {2022},
}

@inproceedings{liao-etal-2022-text,
  author    = {{Wentong Liao, Kai Hu, Michael Ying Yang, and Bodo Rosenhahn}},
  title     = {{Text to Image Generation with Semantic-Spatial Aware {GAN}}},
  booktitle = {CVPR},
  pages     = {18166--18175},
  year      = {2022},
}

% The volume number is the Lecture Notes in Computer Science volume of the proceedings.
@inproceedings{gafni-etal-2022-make-a-scene,
  author       = {{Oran Gafni,
                  Adam Polyak,
                  Oron Ashual,
                  Shelly Sheynin,
                  Devi Parikh, and
                  Yaniv Taigman}},
  title        = {{Make-A-Scene: Scene-Based Text-to-Image Generation with Human Priors}},
  booktitle    = {ECCV},
  series       = {Lecture Notes in Computer Science},
  volume       = {13675},
  pages        = {89--106},
  year         = {2022},
}

% The volume number is the Lecture Notes in Computer Science volume of the proceedings.
@inproceedings{yan-etal-2022-trace,
  author       = {{Kun Yan,
                  Lei Ji,
                  Chenfei Wu,
                  Jianmin Bao,
                  Ming Zhou,
                  Nan Duan, and
                  Shuai Ma}},
  title        = {{Trace Controlled Text to Image Generation}},
  booktitle    = {ECCV},
  series       = {Lecture Notes in Computer Science},
  volume       = {13696},
  pages        = {59--75},
  year         = {2022},
}

% The volume number is the Lecture Notes in Computer Science volume of the proceedings.
@inproceedings{lezama-etal-2022-improved,
  author       = {{Jos{\'{e}} Lezama,
                  Huiwen Chang,
                  Lu Jiang, and
                  Irfan Essa}},
  title        = {{Improved Masked Image Generation with Token-Critic}},
  booktitle    = {ECCV},
  series       = {Lecture Notes in Computer Science},
  volume       = {13683},
  pages        = {70--86},
  year         = {2022},
}

% The volume number is the Lecture Notes in Computer Science volume of the proceedings.
@inproceedings{crowson-etal-2022-vqganclip,
  author       = {{Katherine Crowson,
                  Stella Biderman,
                  Daniel Kornis,
                  Dashiell Stander,
                  Eric Hallahan,
                  Louis Castricato, and
                  Edward Raff}},
  title        = {{{VQGAN-CLIP:} Open Domain Image Generation and Editing with Natural
                  Language Guidance}},
  booktitle    = {ECCV},
  series       = {Lecture Notes in Computer Science},
  volume       = {13697},
  pages        = {88--105},
  year         = {2022},
}

% The volume number is the Lecture Notes in Computer Science volume of the proceedings.
@inproceedings{dinh-etal-2022-tise,
  author       = {{Tan M. Dinh,
                  Rang Nguyen, and
                  Binh{-}Son Hua}},
  title        = {{{TISE:} Bag of Metrics for Text-to-Image Synthesis Evaluation}},
  booktitle    = {ECCV},
  series       = {Lecture Notes in Computer Science},
  volume       = {13696},
  pages        = {594--609},
  year         = {2022},
}

% The volume number is the Lecture Notes in Computer Science volume of the proceedings.
@inproceedings{maharana-etal-2022-storydalle,
  author       = {{Adyasha Maharana,
                  Darryl Hannan, and
                  Mohit Bansal}},
  title        = {{StoryDALL-E: Adapting Pretrained Text-to-Image Transformers for Story
                  Continuation}},
  booktitle    = {ECCV},
  series       = {Lecture Notes in Computer Science},
  volume       = {13697},
  pages        = {70--87},
  year         = {2022},
}

@inproceedings{wu-etal-2022-admagan,
  author    = {{Xintian Wu, Hanbin Zhao, Liangli Zheng, Shouhong Ding, and Xi Li}},
  title     = {{Adma-GAN: Attribute-Driven Memory Augmented GANs for Text-to-Image Generation}},
  booktitle = {ACM MM},
  pages     = {1593--1602},
  publisher = {{ACM}},
  year      = {2022},
}

@inproceedings{chen-etal-2022-background,
  author    = {{Zhuowei Chen, Zhendong Mao, Shancheng Fang, and Bo Hu}},
  title     = {{Background Layout Generation and Object Knowledge Transfer for Text-to-Image Generation}},
  booktitle = {ACM MM},
  pages     = {4327--4335},
  year      = {2022},
}

@inproceedings{huang-etal-2022-dsegan,
  author    = {{Mengqi Huang, Zhendong Mao, Penghui Wang, Quan Wang, and Yongdong Zhang}},
  title     = {{{DSE-GAN:} Dynamic Semantic Evolution Generative Adversarial Network for Text-to-Image Generation}},
  booktitle = {ACM MM},
  pages     = {4345--4354},
  year      = {2022},
}

@inproceedings{shi-etal-2022-athom,
  author    = {{Zhenbo Shi, Zhi Chen, Zhenbo Xu, Wei Yang, and Liusheng Huang}},
  title     = {{AtHom: Two Divergent Attentions Stimulated By Homomorphic Training in Text-to-Image Synthesis}},
  booktitle = {ACM MM},
  pages     = {2211--2219},
  year      = {2022},
}

@inproceedings{xu2023imagereward,
  author    = {{Jiazheng Xu, Xiao Liu, Yuchen Wu, Yuxuan Tong, Qinkai Li, Ming Ding, Jie Tang, and Yuxiao Dong}},
  title     = {{ImageReward: Learning and Evaluating Human Preferences for Text-to-Image Generation}},
  booktitle = {NeurIPS},
  year      = {2023},
}

% NOTE: Technical report with no proceedings venue, so it is typed as @misc.
% Footnote daggers were stripped from the author list and the fused given/family names split.
@misc{betker-etal-2023-dalle3,
  author       = {{James Betker, Gabriel Goh, Li Jing, Tim Brooks, Jianfeng Wang, Linjie Li, Long Ouyang, Juntang Zhuang, Joyce Lee, Yufei Guo, Wesam Manassra, Prafulla Dhariwal, Casey Chu, Yunxin Jiao, and Aditya Ramesh}},
  title        = {{Improving Image Generation with Better Captions}},
  howpublished = {OpenAI technical report},
  url          = {https://cdn.openai.com/papers/dall-e-3.pdf},
  year         = {2023},
}

@article{voynov-etal-2023-promptplus,
  author  = {{Andrey Voynov, Qinghao Chu, Daniel Cohen{-}Or, and Kfir Aberman}},
  title   = {{{P+:} Extended Textual Conditioning in Text-to-Image Generation}},
  journal = {CoRR},
  volume  = {abs/2303.09522},
  year    = {2023},
}

@article{podell-etal-2023-sdxl,
  author  = {{Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M{\"{u}}ller, Joe Penna, and Robin Rombach}},
  title   = {{{SDXL:} Improving Latent Diffusion Models for High-Resolution Image Synthesis}},
  journal = {CoRR},
  volume  = {abs/2307.01952},
  year    = {2023},
}

@article{sauer-etal-2023-sdxl-turbo,
  author  = {{Axel Sauer, Dominik Lorenz, Andreas Blattmann, and Robin Rombach}},
  title   = {{Adversarial Diffusion Distillation}},
  journal = {CoRR},
  volume  = {abs/2311.17042},
  year    = {2023},
}

% The volume number refers to the Proceedings of Machine Learning Research series.
@inproceedings{sauer-etal-2023-stylegant,
  author       = {{Axel Sauer,
                  Tero Karras,
                  Samuli Laine,
                  Andreas Geiger, and
                  Timo Aila}},
  title        = {{StyleGAN-T: Unlocking the Power of GANs for Fast Large-Scale Text-to-Image
                  Synthesis}},
  booktitle    = {ICML},
  series       = {Proceedings of Machine Learning Research},
  volume       = {202},
  pages        = {30105--30118},
  year         = {2023},
}

@inproceedings{kang-etal-2023-gigagan,
  author    = {{Minguk Kang, Jun{-}Yan Zhu, Richard Zhang, Jaesik Park, Eli Shechtman, Sylvain Paris, and Taesung Park}},
  title     = {{Scaling up GANs for Text-to-Image Synthesis}},
  booktitle = {CVPR},
  pages     = {10124--10134},
  year      = {2023},
}

@misc{pernias-etal-2023-wuerstchen,
  author        = {{Pablo Pernias, Dominic Rampas, Mats L. Richter, Christopher J. Pal, and Marc Aubreville}},
  title         = {{Wuerstchen: An Efficient Architecture for Large-Scale Text-to-Image Diffusion Models}},
  year          = {2023},
  eprint        = {2306.00637},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{feng-etal-2023-ernie-vilg,
  author    = {{Zhida Feng, Zhenyu Zhang, Xintong Yu, Yewei Fang, Lanxin Li, Xuyi Chen, Yuxiang Lu, Jiaxiang Liu, Weichong Yin, Shikun Feng, Yu Sun, Li Chen, Hao Tian, Hua Wu, and Haifeng Wang}},
  title     = {{ERNIE-ViLG 2.0: Improving Text-to-Image Diffusion Model with Knowledge-Enhanced Mixture-of-Denoising-Experts}},
  booktitle = {CVPR},
  pages     = {10135--10145},
  year      = {2023},
}

@inproceedings{zhong-etal-2023-sur-adapter,
  author    = {{Shanshan Zhong, Zhongzhan Huang, Wushao Wen, Jinghui Qin, and Liang Lin}},
  title     = {{SUR-adapter: Enhancing Text-to-Image Pre-trained Diffusion Models with Large Language Models}},
  booktitle = {ACM MM},
  pages     = {567--578},
  year      = {2023},
}

% NOTE: SIGGRAPH 2023 journal-track paper; it appeared in ACM Transactions on Graphics,
% which is where the 148:1--148:10 article numbering comes from.
@article{chefer-etal-2023-attend-and-excite,
  author       = {{Hila Chefer,
                  Yuval Alaluf,
                  Yael Vinker,
                  Lior Wolf, and
                  Daniel Cohen{-}Or}},
  title        = {{Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image
                  Diffusion Models}},
  journal      = {{ACM} Trans. Graph.},
  volume       = {42},
  number       = {4},
  pages        = {148:1--148:10},
  year         = {2023},
}

@inproceedings{chang-etal-2023-muse,
  author    = {{Huiwen Chang, Han Zhang, Jarred Barber, Aaron Maschinot, Jos{\'{e}} Lezama, Lu Jiang, Ming{-}Hsuan Yang, Kevin Patrick Murphy, William T. Freeman, Michael Rubinstein, Yuanzhen Li, and Dilip Krishnan}},
  title     = {{Muse: Text-To-Image Generation via Masked Generative Transformers}},
  booktitle = {ICML},
  pages     = {4055--4075},
  year      = {2023},
}

@inproceedings{zhou-etal-2023-shifted,
  author    = {{Yufan Zhou, Bingchen Liu, Yizhe Zhu, Xiao Yang, Changyou Chen, and Jinhui Xu}},
  title     = {{Shifted Diffusion for Text-to-image Generation}},
  booktitle = {CVPR},
  pages     = {10157--10166},
  year      = {2023},
}

@inproceedings{tao-etal-2023-galip,
  author    = {{Ming Tao, Bing{-}Kun Bao, Hao Tang, and Changsheng Xu}},
  title     = {{{GALIP:} Generative Adversarial CLIPs for Text-to-Image Synthesis}},
  booktitle = {CVPR},
  pages     = {14214--14223},
  year      = {2023},
}

@inproceedings{lu-etal-2023-specialist-diffusion,
  author    = {{Haoming Lu, Hazarapet Tunanyan, Kai Wang, Shant Navasardyan, Zhangyang Wang, and Humphrey Shi}},
  title     = {{Specialist Diffusion: Plug-and-Play Sample-Efficient Fine-Tuning of Text-to-Image Diffusion Models to Learn Any Unseen Style}},
  booktitle = {CVPR},
  pages     = {14267--14276},
  year      = {2023},
}

@inproceedings{otani-etal-2023-toward,
  author    = {{Mayu Otani, Riku Togashi, Yu Sawai, Ryosuke Ishigami, Yuta Nakashima, Esa Rahtu, Janne Heikkil{\"{a}}, and Shin'ichi Satoh}},
  title     = {{Toward Verifiable and Reproducible Human Evaluation for Text-to-Image Generation}},
  booktitle = {CVPR},
  pages     = {14277--14286},
  year      = {2023},
}

@inproceedings{liu-etal-2023-riatig,
  author    = {{Han Liu, Yuhao Wu, Shixuan Zhai, Bo Yuan, and Ning Zhang}},
  title     = {{{RIATIG:} Reliable and Imperceptible Adversarial Text-to-Image Generation with Natural Prompts}},
  booktitle = {CVPR},
  pages     = {20585--20594},
  year      = {2023},
}

@misc{kodaira-etal-2023-streamdiffusion,
  author        = {{Akio Kodaira, Chenfeng Xu, Toshiki Hazama, Takanori Yoshimoto, Kohei Ohno, Shogo Mitsuhori, Soichi Sugano, Hanying Cho, Zhijian Liu, and Kurt Keutzer}},
  title         = {{StreamDiffusion: A Pipeline-level Solution for Real-time Interactive Generation}},
  year          = {2023},
  eprint        = {2312.12491},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{chen-etal-2023-controlstyle,
  author    = {{Jingwen Chen, Yingwei Pan, Ting Yao, and Tao Mei}},
  title     = {{ControlStyle: Text-Driven Stylized Image Generation Using Diffusion Priors}},
  booktitle = {ACM MM},
  pages     = {7540--7548},
  year      = {2023},
}

@inproceedings{xue-etal-2023-raphael,
  author    = {{Zeyue Xue, Guanglu Song, Qiushan Guo, Boxiao Liu, Zhuofan Zong, Yu Liu, and Ping Luo}},
  title     = {{{RAPHAEL:} Text-to-Image Generation via Large Mixture of Diffusion Paths}},
  booktitle = {NeurIPS},
  year      = {2023},
}

@inproceedings{xie-etal-2023-difffit,
  author    = {{Enze Xie, Lewei Yao, Han Shi, Zhili Liu, Daquan Zhou, Zhaoqiang Liu, Jiawei Li, and Zhenguo Li}},
  title     = {{DiffFit: Unlocking Transferability of Large Diffusion Models via Simple Parameter-Efficient Fine-Tuning}},
  booktitle = {ICCV},
  pages     = {4207--4216},
  year      = {2023},
}

@inproceedings{rassin-etal-2023-linguistic,
  author    = {{Royi Rassin, Eran Hirsch, Daniel Glickman, Shauli Ravfogel, Yoav Goldberg, and Gal Chechik}},
  title     = {{Linguistic Binding in Diffusion Models: Enhancing Attribute Correspondence through Attention Map Alignment}},
  booktitle = {NeurIPS},
  year      = {2023},
}

@misc{yuan-etal-2024-selfplay,
  author        = {{Huizhuo Yuan, Zixiang Chen, Kaixuan Ji, and Quanquan Gu}},
  title         = {{Self-Play Fine-Tuning of Diffusion Models for Text-to-Image Generation}},
  year          = {2024},
  eprint        = {2402.10210},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}

@misc{yang-etal-2024-rpg,
  author        = {{Ling Yang, Zhaochen Yu, Chenlin Meng, Minkai Xu, Stefano Ermon, and Bin Cui}},
  title         = {{Mastering Text-to-Image Diffusion: Recaptioning, Planning, and Generating with Multimodal LLMs}},
  year          = {2024},
  eprint        = {2401.11708},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{li-etal-2024-playground,
  author        = {{Daiqing Li, Aleks Kamko, Ehsan Akhgari, Ali Sabet, Linmiao Xu, and Suhail Doshi}},
  title         = {{Playground v2.5: Three Insights towards Enhancing Aesthetic Quality in Text-to-Image Generation}},
  year          = {2024},
  eprint        = {2402.17245},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{li-etal-2024-distrifusion,
  author        = {{Muyang Li, Tianle Cai, Jiaxin Cao, Qinsheng Zhang, Han Cai, Junjie Bai, Yangqing Jia, Ming-Yu Liu, Kai Li, and Song Han}},
  title         = {{DistriFusion: Distributed Parallel Inference for High-Resolution Diffusion Models}},
  year          = {2024},
  eprint        = {2402.19481},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{wang-etal-2024-instancediffusion,
  author        = {{Xudong Wang, Trevor Darrell, Sai Saketh Rambhatla, Rohit Girdhar, and Ishan Misra}},
  title         = {{InstanceDiffusion: Instance-level Control for Image Generation}},
  year          = {2024},
  eprint        = {2402.03290},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{patel-etal-2023-eclipse,
  author        = {{Maitreya Patel, Changhoon Kim, Sheng Cheng, Chitta Baral, and Yezhou Yang}},
  title         = {{ECLIPSE: A Resource-Efficient Text-to-Image Prior for Image Generations}},
  year          = {2023},
  eprint        = {2312.04655},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{hu-etal-2024-instructimagen,
  author        = {{Hexiang Hu, Kelvin C. K. Chan, Yu-Chuan Su, Wenhu Chen, Yandong Li, Kihyuk Sohn, Yang Zhao, Xue Ben, Boqing Gong, William Cohen, Ming-Wei Chang, and Xuhui Jia}},
  title         = {{Instruct-Imagen: Image Generation with Multi-modal Instruction}},
  year          = {2024},
  eprint        = {2401.01952},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
% @article requires `journal` (not `booktitle`); authors follow this file's "First Last" list form.
@article{cheng-etal-2023-learning,
  author  = {{Ta-Ying Cheng, Matheus Gadelha, Thibault Groueix, Matthew Fisher, Radomir Mech, Andrew Markham, and Niki Trigoni}},
  title   = {{Learning Continuous 3D Words for Text-to-Image Generation}},
  journal = {arXiv},
  year    = {2024},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{narasimhaswamy-etal-2024-handiffuser,
  author        = {{Supreeth Narasimhaswamy, Uttaran Bhattacharya, Xiang Chen, Ishita Dasgupta, Saayan Mitra, and Minh Hoai}},
  title         = {{HanDiffuser: Text-to-Image Generation With Realistic Hand Appearances}},
  year          = {2024},
  eprint        = {2403.01693},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{liang-etal-2023-rich,
  author        = {{Youwei Liang, Junfeng He, Gang Li, Peizhao Li, Arseniy Klimovskiy, Nicholas Carolan, Jiao Sun, Jordi Pont-Tuset, Sarah Young, Feng Yang, Junjie Ke, Krishnamurthy Dj Dvijotham, Katie Collins, Yiwen Luo, Yang Li, Kai J Kohlhoff, Deepak Ramachandran, and Vidhya Navalpakkam}},
  title         = {{Rich Human Feedback for Text-to-Image Generation}},
  year          = {2023},
  eprint        = {2312.10240},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{jayasumana-etal-2023-markovgen,
  author        = {{Sadeep Jayasumana, Daniel Glasner, Srikumar Ramalingam, Andreas Veit, Ayan Chakrabarti, and Sanjiv Kumar}},
  title         = {{MarkovGen: Structured Prediction for Efficient Text-to-Image Generation}},
  year          = {2023},
  eprint        = {2308.10997},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{zhou-etal-2023-customization,
  author        = {{Yufan Zhou, Ruiyi Zhang, Jiuxiang Gu, and Tong Sun}},
  title         = {{Customization Assistant for Text-to-image Generation}},
  year          = {2023},
  eprint        = {2312.03045},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{li-etal-2023-stylegan,
  author        = {{Xiaoming Li, Xinyu Hou, and Chen Change Loy}},
  title         = {{When StyleGAN Meets Stable Diffusion: a $\mathscr{W}_+$ Adapter for Personalized Image Generation}},
  year          = {2023},
  eprint        = {2311.17461},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{huang-etal-2023-learning,
  author        = {{Siteng Huang, Biao Gong, Yutong Feng, Xi Chen, Yuqian Fu, Yu Liu, and Donglin Wang}},
  title         = {{Learning Disentangled Identifiers for Action-Customized Text-to-Image Generation}},
  year          = {2023},
  eprint        = {2311.15841},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{xu-etal-2023-ufogen,
  author        = {{Yanwu Xu, Yang Zhao, Zhisheng Xiao, and Tingbo Hou}},
  title         = {{UFOGen: You Forward Once Large Scale Text-to-Image Generation via Diffusion GANs}},
  year          = {2023},
  eprint        = {2311.09257},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{li-etal-2023-selfdiscovering,
  author        = {{Hang Li, Chengzhi Shen, Philip Torr, Volker Tresp, and Jindong Gu}},
  title         = {{Self-Discovering Interpretable Diffusion Latent Directions for Responsible Text-to-Image Generation}},
  year          = {2023},
  eprint        = {2311.17216},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{mei-etal-2024-codi,
  author        = {{Kangfu Mei, Mauricio Delbracio, Hossein Talebi, Zhengzhong Tu, Vishal M. Patel, and Peyman Milanfar}},
  title         = {{CoDi: Conditional Diffusion Distillation for Higher-Fidelity and Faster Image Generation}},
  year          = {2024},
  eprint        = {2310.01407},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{ding-etal-2024-patched,
  author    = {{Zheng Ding, Mengqi Zhang, Jiajun Wu, and Zhuowen Tu}},
  title     = {{Patched Denoising Diffusion Models For High-Resolution Image Synthesis}},
  booktitle = {ICLR},
  pages     = {1--18},
  year      = {2024},
}

% Author and title use the double-brace literal form of the other entries in this file;
% a single-braced comma-separated author list makes BibTeX report "too many commas in name".
@inproceedings{teng-etal-2024-relaydiffusion,
title={{Relay Diffusion: Unifying diffusion process across resolutions for image synthesis}},
author={{Jiayan Teng, Wendi Zheng, Ming Ding, Wenyi Hong, Jianqiao Wangni, Zhuoyi Yang, and Jie Tang}},
booktitle={ICLR},
year={2024},
pages={1--18}
}

@inproceedings{podell-etal-2024-sdxl,
  author    = {{Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M{\"u}ller, Joe Penna, and Robin Rombach}},
  title     = {{{SDXL}: Improving Latent Diffusion Models for High-Resolution Image Synthesis}},
  booktitle = {ICLR},
  year      = {2024},
  pages     = {1--13},
}

@misc{lee-etal-2024-composeandconquer,
  author        = {{Jonghyun Lee, Hansam Cho, Youngjoon Yoo, Seoung Bum Kim, and Yonghyun Jeong}},
  title         = {{Compose and Conquer: Diffusion-Based 3D Depth Aware Composable Image Synthesis}},
  year          = {2024},
  eprint        = {2401.09048},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{tan-etal-2023-semanticaware,
  author        = {{Zhaorui Tan, Xi Yang, and Kaizhu Huang}},
  title         = {{Semantic-aware Data Augmentation for Text-to-image Synthesis}},
  year          = {2023},
  eprint        = {2312.07951},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{cheng-etal-2024-resadapter,
  author        = {{Jiaxiang Cheng, Pan Xie, Xin Xia, Jiashi Li, Jie Wu, Yuxi Ren, Huixia Li, Xuefeng Xiao, Min Zheng, and Lean Fu}},
  title         = {{ResAdapter: Domain Consistent Resolution Adapter for Diffusion Models}},
  year          = {2024},
  eprint        = {2403.02084},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{fei-etal-2024-dis,
  author        = {{Zhengcong Fei, Mingyuan Fan, Changqian Yu, and Junshi Huang}},
  title         = {{Scalable Diffusion Models with State Space Backbone}},
  year          = {2024},
  eprint        = {2402.05608},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{wang-etal-2024-instantid,
  author        = {{Qixun Wang, Xu Bai, Haofan Wang, Zekui Qin, Anthony Chen, Huaxia Li, Xu Tang, and Yao Hu}},
  title         = {{InstantID: Zero-shot Identity-Preserving Generation in Seconds}},
  year          = {2024},
  eprint        = {2401.07519},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{chen-etal-2024-pixartdelta,
  author        = {{Junsong Chen, Yue Wu, Simian Luo, Enze Xie, Sayak Paul, Ping Luo, Hang Zhao, and Zhenguo Li}},
  title         = {{PIXART-{$\delta$}: Fast and Controllable Image Generation with Latent Consistency Models}},
  year          = {2024},
  eprint        = {2401.05252},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% The Greek letter is typeset in braced math mode; the exported form with
% escaped dollar signs printed the raw characters instead of the symbol.
@inproceedings{chen-etal-2024-pixartalpha,
  author    = {{Junsong Chen, Jincheng Yu, Chongjian Ge, Lewei Yao, Enze Xie, Zhongdao Wang, James Kwok, Ping Luo, Huchuan Lu, and Zhenguo Li}},
  title     = {{PixArt-{$\alpha$}: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis}},
  booktitle = {ICLR},
  year      = {2024},
  pages     = {1--31},
}

@misc{chen-etal-2024-pixartsigma,
  author        = {{Junsong Chen, Chongjian Ge, Enze Xie, Yue Wu, Lewei Yao, Xiaozhe Ren, Zhongdao Wang, Ping Luo, Huchuan Lu, and Zhenguo Li}},
  title         = {{PixArt-$\Sigma$: Weak-to-Strong Training of Diffusion Transformer for 4K Text-to-Image Generation}},
  year          = {2024},
  eprint        = {2403.04692},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{zheng-etal-2024-cogview3,
  author        = {{Wendi Zheng, Jiayan Teng, Zhuoyi Yang, Weihan Wang, Jidong Chen, Xiaotao Gu, Yuxiao Dong, Ming Ding, and Jie Tang}},
  title         = {{CogView3: Finer and Faster Text-to-Image Generation via Relay Diffusion}},
  year          = {2024},
  eprint        = {2403.05121},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{hu-etal-2024-ella,
  author        = {{Xiwei Hu, Rui Wang, Yixiao Fang, Bin Fu, Pei Cheng, and Gang Yu}},
  title         = {{ELLA: Equip Diffusion Models with LLM for Enhanced Semantic Alignment}},
  year          = {2024},
  eprint        = {2403.05135},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{su-etal-2024-text2street,
  author        = {{Jinming Su, Songen Gu, Yiting Duan, Xingyue Chen, and Junfeng Luo}},
  title         = {{Text2Street: Controllable Text-to-image Generation for Street Views}},
  year          = {2024},
  eprint        = {2402.04504},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{zhang-etal-2024-layerdiffuse,
  author        = {{Lvmin Zhang and Maneesh Agrawala}},
  title         = {{Transparent Image Layer Diffusion using Latent Transparency}},
  year          = {2024},
  eprint        = {2402.17113},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{sauer-etal-2024-sd3-turbo,
  author        = {{Axel Sauer, Frederic Boesel, Tim Dockhorn, Andreas Blattmann, Patrick Esser, and Robin Rombach}},
  title         = {{Fast High-Resolution Image Synthesis with Latent Adversarial Diffusion Distillation}},
  year          = {2024},
  eprint        = {2403.12015},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{lee-etal-2024-streammultidiffusion,
  author        = {{Jaerin Lee, Daniel Sungho Jung, Kanggeon Lee, and Kyoung Mu Lee}},
  title         = {{StreamMultiDiffusion: Real-Time Interactive Generation with Region-Based Semantic Control}},
  year          = {2024},
  eprint        = {2403.09055},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{xing-etal-2024-svgdreamer,
  author        = {{Ximing Xing, Haitao Zhou, Chuang Wang, Jing Zhang, Dong Xu, and Qian Yu}},
  title         = {{SVGDreamer: Text Guided SVG Generation with Diffusion Model}},
  year          = {2024},
  eprint        = {2312.16476},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{kim-etal-2024-arbitraryscale,
  author        = {{Jinseok Kim and Tae-Kyun Kim}},
  title         = {{Arbitrary-Scale Image Generation and Upsampling using Latent Diffusion Model and Implicit Neural Decoder}},
  year          = {2024},
  eprint        = {2403.10255},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{wang-etal-2024-effective,
  author        = {{Junyan Wang, Zhenhong Sun, Zhiyu Tan, Xuanbai Chen, Weihua Chen, Hao Li, Cheng Zhang, and Yang Song}},
  title         = {{Towards Effective Usage of Human-Centric Priors in Diffusion Models for Text-based Human Image Generation}},
  year          = {2024},
  eprint        = {2403.05239},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{hajiali-etal-2024-elasticdiffusion,
  author        = {{Moayed Haji-Ali, Guha Balakrishnan, and Vicente Ordonez}},
  title         = {{ElasticDiffusion: Training-free Arbitrary Size Image Generation through Global-Local Content Separation}},
  year          = {2024},
  eprint        = {2311.18822},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{zhang-etal-2024-singdiffusion,
  author        = {{Pengze Zhang, Hubery Yin, Chen Li, and Xiaohua Xie}},
  title         = {{Tackling the Singularities at the Endpoints of Time Intervals in Diffusion Models}},
  year          = {2024},
  eprint        = {2403.08381},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{li-etal-2024-cosmicman,
  author        = {{Shikai Li, Jianglin Fu, Kaiyuan Liu, Wentao Wang, Kwan-Yee Lin, and Wayne Wu}},
  title         = {{CosmicMan: A Text-to-Image Foundation Model for Humans}},
  year          = {2024},
  eprint        = {2404.01294},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% The degree sign is written as braced math; the \deg macro is the math
% operator "deg" and is not valid in a text-mode title.
@misc{zhang-etal-2024-panfusion,
  author        = {{Cheng Zhang, Qianyi Wu, Camilo Cruz Gambardella, Xiaoshui Huang, Dinh Phung, Wanli Ouyang, and Jianfei Cai}},
  title         = {{Taming Stable Diffusion for Text to 360{$^\circ$} Panorama Image Generation}},
  year          = {2024},
  eprint        = {2404.07949},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{liu-etal-2024-intelligent-grimm,
  author        = {{Chang Liu, Haoning Wu, Yujie Zhong, Xiaoyun Zhang, Yanfeng Wang, and Weidi Xie}},
  title         = {{Intelligent Grimm -- Open-ended Visual Storytelling via Latent Diffusion Models}},
  year          = {2024},
  eprint        = {2306.00973},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% The literal author list ends with ", and <last author>" like every other
% double-braced list in this file.
@misc{zhou-etal-2024-storydiffusion,
  author        = {{Yupeng Zhou, Daquan Zhou, Ming-Ming Cheng, Jiashi Feng, and Qibin Hou}},
  title         = {{StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation}},
  year          = {2024},
  eprint        = {2405.01434},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% Title is double-braced, as elsewhere in this file, so that the style does not
% lower-case "DiT", "Multi-Resolution" or "Chinese".
@misc{li-etal-2024-hunyuandit,
  author        = {{Zhimin Li, Jianwei Zhang, Qin Lin, Jiangfeng Xiong, Yanxin Long, Xinchi Deng, Yingfang Zhang, Xingchao Liu, Minbin Huang, Zedong Xiao, Dayou Chen, Jiajun He, Jiahao Li, Wenyue Li, Chen Zhang, Rongwei Quan, Jianxiang Lu, Jiabin Huang, Xiaoyan Yuan, Xiaoxiao Zheng, Yixuan Li, Jihong Zhang, Chao Zhang, Meng Chen, Jie Liu, Zheng Fang, Weiyan Wang, Jinbao Xue, Yangyu Tao, Jianchen Zhu, Kai Liu, Sihuan Lin, Yifu Sun, Yun Li, Dongdong Wang, Mingtao Chen, Zhichao Hu, Xiao Xiao, Yan Chen, Yuhong Liu, Wei Liu, Di Wang, Yong Yang, Jie Jiang, and Qinglin Lu}},
  title         = {{Hunyuan-DiT: A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding}},
  year          = {2024},
  eprint        = {2405.08748},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{huang-etal-2024-dialoggen,
  author        = {{Minbin Huang, Yanxin Long, Xinchi Deng, Ruihang Chu, Jiangfeng Xiong, Xiaodan Liang, Hong Cheng, Qinglin Lu, and Wei Liu}},
  title         = {{DialogGen: Multi-modal Interactive Dialogue System for Multi-turn Text-to-Image Generation}},
  year          = {2024},
  eprint        = {2403.08857},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{han-etal-2024-faceadapter,
  author        = {{Yue Han, Junwei Zhu, Keke He, Xu Chen, Yanhao Ge, Wei Li, Xiangtai Li, Jiangning Zhang, Chengjie Wang, and Yong Liu}},
  title         = {{Face Adapter for Pre-Trained Diffusion Models with Fine-Grained ID and Attribute Control}},
  year          = {2024},
  eprint        = {2405.12970},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{li-etal-2024-scalability,
  author        = {Hao Li and Yang Zou and Ying Wang and Orchid Majumder and Yusheng Xie and R. Manmatha and Ashwin Swaminathan and Zhuowen Tu and Stefano Ermon and Stefano Soatto},
  title         = {{On the Scalability of Diffusion-based Text-to-Image Generation}},
  year          = {2024},
  eprint        = {2404.02883},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{tudosiu-etal-2024-mulan,
  author        = {Petru-Daniel Tudosiu and Yongxin Yang and Shifeng Zhang and Fei Chen and Steven McDonagh and Gerasimos Lampouras and Ignacio Iacobacci and Sarah Parisot},
  title         = {{MULAN: A Multi Layer Annotated Dataset for Controllable Text-to-Image Generation}},
  year          = {2024},
  eprint        = {2404.02790},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{zhang-etal-2024-learning,
  author        = {Sixian Zhang and Bohan Wang and Junqiang Wu and Yan Li and Tingting Gao and Di Zhang and Zhongyuan Wang},
  title         = {{Learning Multi-dimensional Human Preference for Text-to-Image Generation}},
  year          = {2024},
  eprint        = {2405.14705},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{mo-etal-2024-dynamic,
  author        = {Wenyi Mo and Tianyu Zhang and Yalong Bai and Bing Su and Ji-Rong Wen and Qing Yang},
  title         = {{Dynamic Prompt Optimizing for Text-to-Image Generation}},
  year          = {2024},
  eprint        = {2404.04095},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{ding-etal-2024-freecustom,
  author        = {Ganggui Ding and Canyu Zhao and Wen Wang and Zhen Yang and Zide Liu and Hao Chen and Chunhua Shen},
  title         = {{FreeCustom: Tuning-Free Customized Image Generation for Multi-Concept Composition}},
  year          = {2024},
  eprint        = {2405.13870},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% TODO: Missing reference of the paper `Training Diffusion Models Towards Diverse Image Generation with Reinforcement Learning'

% TODO: Missing reference of the paper `Adversarial Text to Continuous Image Generation'

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{zhao-etal-2024-bridging,
  author        = {Shihao Zhao and Shaozhe Hao and Bojia Zi and Huaizhe Xu and Kwan-Yee K. Wong},
  title         = {{Bridging Different Language Models and Generative Vision Models for Text-to-Image Generation}},
  year          = {2024},
  eprint        = {2403.07860},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{yang-etal-2024-exploring,
  author        = {Danni Yang and Ruohan Dong and Jiayi Ji and Yiwei Ma and Haowei Wang and Xiaoshuai Sun and Rongrong Ji},
  title         = {{Exploring Phrase-Level Grounding with Text-to-Image Diffusion Model}},
  year          = {2024},
  eprint        = {2407.05352},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{chatterjee-etal-2024-getting,
  author        = {Agneet Chatterjee and Gabriela Ben Melech Stan and Estelle Aflalo and Sayak Paul and Dhruba Ghosh and Tejas Gokhale and Ludwig Schmidt and Hannaneh Hajishirzi and Vasudev Lal and Chitta Baral and Yezhou Yang},
  title         = {{Getting it Right: Improving Spatial Consistency in Text-to-Image Models}},
  year          = {2024},
  eprint        = {2404.01197},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{mittal-etal-2024-navigating,
  author        = {Surbhi Mittal and Arnav Sudan and Mayank Vatsa and Richa Singh and Tamar Glaser and Tal Hassner},
  title         = {{Navigating Text-to-Image Generative Bias across Indic Languages}},
  year          = {2024},
  eprint        = {2408.00283},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{kim-etal-2024-safeguard,
  author        = {Sanghyun Kim and Seohyeon Jung and Balhae Kim and Moonseok Choi and Jinwoo Shin and Juho Lee},
  title         = {{Safeguard Text-to-Image Diffusion Models with Human Feedback Inversion}},
  year          = {2024},
  eprint        = {2407.21032},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{yao-etal-2024-fabrication,
  author        = {Yi Yao and Chan-Feng Hsu and Jhe-Hao Lin and Hongxia Xie and Terence Lin and Yi-Ning Huang and Hong-Han Shuai and Wen-Huang Cheng},
  title         = {{The Fabrication of Reality and Fantasy: Scene Generation with LLM-Assisted Prompt Interpretation}},
  year          = {2024},
  eprint        = {2407.12579},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{gong-etal-2024-reliable,
  author        = {Chao Gong and Kai Chen and Zhipeng Wei and Jingjing Chen and Yu-Gang Jiang},
  title         = {{Reliable and Efficient Concept Erasure of Text-to-Image Diffusion Models}},
  year          = {2024},
  eprint        = {2407.12383},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: A second, byte-identical copy of the `yang-etal-2024-exploring' entry was
% removed from this position because BibTeX rejects repeated keys. The entry is
% still defined once, earlier in this section.

@misc{liu-etal-2024-linfusion,
  author        = {Songhua Liu and Weihao Yu and Zhenxiong Tan and Xinchao Wang},
  title         = {{LinFusion: 1 GPU, 1 Minute, 16K Image}},
  year          = {2024},
  eprint        = {2409.02097},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{li-etal-2024-styletokenizer,
  author        = {Wen Li and Muyuan Fang and Cheng Zou and Biao Gong and Ruobing Zheng and Meng Wang and Jingdong Chen and Ming Yang},
  title         = {{StyleTokenizer: Defining Image Style by a Single Instance for Controlling Diffusion Models}},
  year          = {2024},
  eprint        = {2409.02543},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{xiao-etal-2024-omnigen,
  author        = {Shitao Xiao and Yueze Wang and Junjie Zhou and Huaying Yuan and Xingrun Xing and Ruiran Yan and Shuting Wang and Tiejun Huang and Zheng Liu},
  title         = {{OmniGen: Unified Image Generation}},
  year          = {2024},
  eprint        = {2409.11340},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{yang-etal-2024-emogen,
  author    = {Jingyuan Yang and Jiawei Feng and Hui Huang},
  title     = {{EmoGen: Emotional Image Content Generation with Text-to-Image Diffusion Models}},
  booktitle = {CVPR},
  year      = {2024},
  pages     = {6358--6368},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@article{ma-etal-2024-peadiffusion,
  author  = {Jian Ma and Chen Chen and Qingsong Xie and Haonan Lu},
  title   = {{PEA-Diffusion: Parameter-Efficient Adapter with Knowledge Distillation in non-English Text-to-Image Generation}},
  journal = {CoRR},
  volume  = {abs/2311.17086},
  year    = {2023},
}


@inproceedings{lee-etal-2024-parrot,
  author    = {Seung Hyun Lee and Yinxiao Li and Junjie Ke and Innfarn Yoo and Han Zhang and Jiahui Yu and Qifei Wang and Fei Deng and Glenn Entis and Junfeng He and Gang Li and Sangpil Kim and Irfan Essa and Feng Yang},
  title     = {{Parrot: Pareto-Optimal Multi-reward Reinforcement Learning Framework for Text-to-Image Generation}},
  booktitle = {ECCV},
  year      = {2024},
  pages     = {462--478},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@article{chang-etal-2024-skews,
  author  = {Yingshan Chang and Yasi Zhang and Zhiyuan Fang and Yingnian Wu and Yonatan Bisk and Feng Gao},
  title   = {{Skews in the Phenomenon Space Hinder Generalization in Text-to-Image Generation}},
  journal = {CoRR},
  volume  = {abs/2403.16394},
  year    = {2024},
}

% TODO: Missing reference for the paper `MasterWeaver: Taming Editability and Face Identity for Personalized Text-to-Image Generation'

% NOTE: A second `zhao-etal-2024-bridging' entry (CoRR abs/2403.07860) was removed
% from this position. The same paper is already defined under the same key earlier
% in this section, and BibTeX rejects repeated keys.

% TODO: Missing reference for the paper `MobileDiffusion: Instant Text-to-Image Generation on Mobile Devices'

% Published at ICCV 2023. Title is double-braced, as elsewhere in this file,
% to keep its capitalisation.
@inproceedings{kim-etal-2023-densediffusion,
  author    = {Yunji Kim and Jiyoung Lee and Jin{-}Hwa Kim and Jung{-}Woo Ha and Jun{-}Yan Zhu},
  title     = {{Dense Text-to-Image Generation with Attention Modulation}},
  booktitle = {ICCV},
  year      = {2023},
  pages     = {7667--7677},
}

@misc{zhang-etal-2024-compass,
  author        = {Gaoyang Zhang and Bingtao Fu and Qingnan Fan and Qi Zhang and Runxing Liu and Hong Gu and Huaqi Zhang and Xinguo Liu},
  title         = {{CoMPaSS: Enhancing Spatial Understanding in Text-to-Image Diffusion Models}},
  year          = {2024},
  eprint        = {2412.13195},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% ----------------CONDITIONAL TEXT-TO-IMAGE GENERATION---------------------
@inproceedings{meng-etal-2022-sdedit,
  author    = {{Chenlin Meng, Yutong He, Yang Song, Jiaming Song, Jiajun Wu, Jun{-}Yan Zhu, and Stefano Ermon}},
  title     = {{SDEdit: Guided Image Synthesis and Editing with Stochastic Differential Equations}},
  booktitle = {ICLR},
  year      = {2022},
}

% arXiv preprint; journal/volume follow the CoRR convention used by the other
% preprint articles in this file (an article entry requires a journal field).
@article{wang-etal-2023-piti,
  author  = {{Tengfei Wang, Ting Zhang, Bo Zhang, Hao Ouyang, Dong Chen, Qifeng Chen, and Fang Wen}},
  title   = {{Pretraining is All You Need for Image-to-Image Translation}},
  journal = {CoRR},
  volume  = {abs/2205.12952},
  year    = {2022},
}

@inproceedings{li-etal-2023-gligen,
  author    = {{Yuheng Li, Haotian Liu, Qingyang Wu, Fangzhou Mu, Jianwei Yang, Jianfeng Gao, Chunyuan Li, and Yong Jae Lee}},
  title     = {{{GLIGEN:} Open-Set Grounded Text-to-Image Generation}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {22511--22521},
}

@inproceedings{zhang-etal-2023-controlnet,
  author    = {{Lvmin Zhang, Anyi Rao, and Maneesh Agrawala}},
  title     = {{Adding Conditional Control to Text-to-Image Diffusion Models}},
  booktitle = {ICCV},
  year      = {2023},
  pages     = {3813--3824},
}

% CoRR entries in this file carry the arXiv identifier in the volume field.
@article{mou-etal-2023-t2i-adapter,
  author  = {{Chong Mou, Xintao Wang, Liangbin Xie, Jian Zhang, Zhongang Qi, Ying Shan, and Xiaohu Qie}},
  title   = {{T2I-Adapter: Learning Adapters to Dig out More Controllable Ability for Text-to-Image Diffusion Models}},
  journal = {CoRR},
  volume  = {abs/2302.08453},
  year    = {2023},
}

@inproceedings{huang-etal-2023-composer,
  author    = {{Lianghua Huang, Di Chen, Yu Liu, Yujun Shen, Deli Zhao, and Jingren Zhou}},
  title     = {{Composer: Creative and Controllable Image Synthesis with Composable Conditions}},
  booktitle = {ICML},
  year      = {2023},
  pages     = {13753--13773},
}

@inproceedings{voynov-etal-2023-sketch,
  author    = {{Andrey Voynov, Kfir Aberman, and Daniel Cohen{-}Or}},
  title     = {{Sketch-Guided Text-to-Image Diffusion Models}},
  booktitle = {SIGGRAPH},
  year      = {2023},
  pages     = {55:1--55:11},
}

@inproceedings{bartal-etal-2023-multidiffusion,
  author    = {{Omer Bar{-}Tal, Lior Yariv, Yaron Lipman, and Tali Dekel}},
  title     = {{MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation}},
  booktitle = {ICML},
  year      = {2023},
  pages     = {1737--1752},
}

@inproceedings{avrahami-etal-2023-spatext,
  author    = {{Omri Avrahami, Thomas Hayes, Oran Gafni, Sonal Gupta, Yaniv Taigman, Devi Parikh, Dani Lischinski, Ohad Fried, and Xi Yin}},
  title     = {{SpaText: Spatio-Textual Representation for Controllable Image Generation}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {18370--18380},
}

@misc{zhao-etal-2023-unicontrolnet,
  author        = {{Shihao Zhao, Dongdong Chen, Yen-Chun Chen, Jianmin Bao, Shaozhe Hao, Lu Yuan, and Kwan-Yee K. Wong}},
  title         = {{Uni-ControlNet: All-in-One Control to Text-to-Image Diffusion Models}},
  year          = {2023},
  eprint        = {2305.16322},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{wang-etal-2023-incontext,
  author    = {{Zhendong Wang, Yifan Jiang, Yadong Lu, Yelong Shen, Pengcheng He, Weizhu Chen, Zhangyang Wang, and Mingyuan Zhou}},
  title     = {{In-Context Learning Unlocked for Diffusion Models}},
  booktitle = {NeurIPS},
  year      = {2023},
}

@inproceedings{liu-etal-2023-more,
  author    = {{Xihui Liu, Dong Huk Park, Samaneh Azadi, Gong Zhang, Arman Chopikyan, Yuxiao Hu, Humphrey Shi, Anna Rohrbach, and Trevor Darrell}},
  title     = {{More Control for Free! Image Synthesis with Semantic Diffusion Guidance}},
  booktitle = {WACV},
  year      = {2023},
  pages     = {289--299},
}

@inproceedings{yang-etal-2023-reco,
  author    = {{Zhengyuan Yang, Jianfeng Wang, Zhe Gan, Linjie Li, Kevin Lin, Chenfei Wu, Nan Duan, Zicheng Liu, Ce Liu, Michael Zeng, and Lijuan Wang}},
  title     = {{ReCo: Region-Controlled Text-to-Image Generation}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {14246--14255},
}

@inproceedings{farshad-etal-2023-scenegenie,
  author    = {{Azade Farshad, Yousef Yeganeh, Yu Chi, Chengzhi Shen, Bj{\"{o}}rn Ommer, and Nassir Navab}},
  title     = {{SceneGenie: Scene Graph Guided Diffusion Models for Image Synthesis}},
  booktitle = {ICCV},
  year      = {2023},
  pages     = {88--98},
}

@misc{li-etal-2023-blipdiffusion,
  author        = {{Dongxu Li, Junnan Li, and Steven C. H. Hoi}},
  title         = {{BLIP-Diffusion: Pre-trained Subject Representation for Controllable Text-to-Image Generation and Editing}},
  year          = {2023},
  eprint        = {2305.14720},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{qu-etal-2023-layoutllm-t2i,
  author    = {{Leigang Qu, Shengqiong Wu, Hao Fei, Liqiang Nie, and Tat{-}Seng Chua}},
  title     = {{LayoutLLM-T2I: Eliciting Layout Guidance from {LLM} for Text-to-Image Generation}},
  booktitle = {ACM MM},
  year      = {2023},
  pages     = {643--654},
}

@inproceedings{feng-etal-2023-training,
  author    = {{Weixi Feng, Xuehai He, Tsu{-}Jui Fu, Varun Jampani, Arjun R. Akula, Pradyumna Narayana, Sugato Basu, Xin Eric Wang, and William Yang Wang}},
  title     = {{Training-Free Structured Diffusion Guidance for Compositional Text-to-Image Synthesis}},
  booktitle = {ICLR},
  year      = {2023},
}

@inproceedings{zheng-etal-2023-layoutdiffusion,
  author    = {{Guangcong Zheng, Xianpan Zhou, Xuewei Li, Zhongang Qi, Ying Shan, and Xi Li}},
  title     = {{LayoutDiffusion: Controllable Diffusion Model for Layout-to-Image Generation}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {22490--22499},
}

@inproceedings{couairon-etal-2023-zestguide,
  author    = {{Guillaume Couairon, Marl{\`{e}}ne Careil, Matthieu Cord, St{\'{e}}phane Lathuili{\`{e}}re, and Jakob Verbeek}},
  title     = {{Zero-shot spatial layout conditioning for text-to-image diffusion models}},
  booktitle = {ICCV},
  year      = {2023},
  pages     = {2174--2183},
}

@article{ye-etal-2023-ipadapter,
  author  = {{Hu Ye, Jun Zhang, Sibo Liu, Xiao Han, and Wei Yang}},
  title   = {{IP-Adapter: Text Compatible Image Prompt Adapter for Text-to-Image Diffusion Models}},
  journal = {CoRR},
  volume  = {abs/2308.06721},
  year    = {2023},
}

@misc{liu-etal-2023-lateconstraint,
  author        = {{Chang Liu and Dong Liu}},
  title         = {{Late-Constraint Diffusion Guidance for Controllable Image Synthesis}},
  year          = {2023},
  eprint        = {2305.11520},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% Recorded as the WACV 2024 publication, matching the key and the year.
% The arXiv identifier of the 2023 preprint is kept in the eprint fields.
@inproceedings{chen-etal-2024-training,
  author        = {{Minghao Chen, Iro Laina, and Andrea Vedaldi}},
  title         = {{Training-Free Layout Control with Cross-Attention Guidance}},
  booktitle     = {WACV},
  year          = {2024},
  eprint        = {2304.03373},
  archivePrefix = {arXiv},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{lv-etal-2024-place,
  author        = {{Zhengyao Lv, Yuxiang Wei, Wangmeng Zuo, and Kwan-Yee K. Wong}},
  title         = {{PLACE: Adaptive Layout-Semantic Fusion for Semantic Image Synthesis}},
  year          = {2024},
  eprint        = {2403.01852},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{cho-etal-2024-oneshot,
  author        = {{Hansam Cho, Jonghyun Lee, Seunggyu Chang, and Yonghyun Jeong}},
  title         = {{One-Shot Structure-Aware Stylized Image Synthesis}},
  year          = {2024},
  eprint        = {2402.17275},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{phung-etal-2023-grounded,
  author        = {{Quynh Phung, Songwei Ge, and Jia-Bin Huang}},
  title         = {{Grounded Text-to-Image Synthesis with Attention Refocusing}},
  year          = {2023},
  eprint        = {2306.05427},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{lu-etal-2024-coarsetofine,
  author        = {{Yanzuo Lu, Manlin Zhang, Andy J Ma, Xiaohua Xie, and Jian-Huang Lai}},
  title         = {{Coarse-to-Fine Latent Diffusion for Pose-Guided Person Image Synthesis}},
  year          = {2024},
  eprint        = {2402.18078},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{shen-etal-2024-advancing,
  author    = {{Fei Shen, Hu Ye, Jun Zhang, Cong Wang, Xiao Han, and Yang Wei}},
  title     = {{Advancing Pose-Guided Image Synthesis with Progressive Conditional Diffusion Models}},
  booktitle = {ICLR},
  year      = {2024},
  pages     = {1--19},
}

@inproceedings{jia-etal-2024-ssmg,
  author    = {{Chengyou Jia, Minnan Luo, Zhuohang Dang, Guang Dai, Xiaojun Chang, Mengmeng Wang, and Jingdong Wang}},
  title     = {{{SSMG:} Spatial-Semantic Map Guided Diffusion Model for Free-Form Layout-to-Image Generation}},
  booktitle = {AAAI},
  year      = {2024},
  pages     = {2480--2488},
}

@inproceedings{wang-etal-2024-compositional,
  author    = {{Ruichen Wang, Zekang Chen, Chen Chen, Jian Ma, Haonan Lu, and Xiaodong Lin}},
  title     = {{Compositional Text-to-Image Synthesis with Attention Map Control of Diffusion Models}},
  booktitle = {AAAI},
  year      = {2024},
  pages     = {5544--5552},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{qi-etal-2024-deadiff,
  author        = {{Tianhao Qi, Shancheng Fang, Yanze Wu, Hongtao Xie, Jiawei Liu, Lang Chen, Qian He, and Yongdong Zhang}},
  title         = {{DEADiff: An Efficient Stylization Diffusion Model with Disentangled Representations}},
  year          = {2024},
  eprint        = {2403.06951},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{jiang-etal-2024-comat,
  author        = {{Dongzhi Jiang, Guanglu Song, Xiaoshi Wu, Renrui Zhang, Dazhong Shen, Zhuofan Zong, Yu Liu, and Hongsheng Li}},
  title         = {{CoMat: Aligning Text-to-Image Diffusion Model with Image-to-Text Concept Matching}},
  year          = {2024},
  eprint        = {2404.03653},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{wang-etal-2024-detdiffusion,
  author        = {{Yibo Wang, Ruiyuan Gao, Kai Chen, Kaiqiang Zhou, Yingjie Cai, Lanqing Hong, Zhenguo Li, Lihui Jiang, Dit-Yan Yeung, Qiang Xu, and Kai Zhang}},
  title         = {{DetDiffusion: Synergizing Generative and Perceptive Models for Enhanced Data Generation and Perception}},
  year          = {2024},
  eprint        = {2403.13304},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{li-etal-2024-controlnet-plus-plus,
  author        = {{Ming Li, Taojiannan Yang, Huafeng Kuang, Jie Wu, Zhaoning Wang, Xuefeng Xiao, and Chen Chen}},
  title         = {{ControlNet++: Improving Conditional Controls with Efficient Consistency Feedback}},
  year          = {2024},
  eprint        = {2404.07987},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{cai-etal-2024-can,
  author        = {{Han Cai, Muyang Li, Zhuoyang Zhang, Qinsheng Zhang, Ming-Yu Liu, and Song Han}},
  title         = {{Condition-Aware Neural Network for Controlled Image Generation}},
  year          = {2024},
  eprint        = {2404.01143},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{ren-etal-2024-layered,
  author        = {{Jiawei Ren, Mengmeng Xu, Jui-Chieh Wu, Ziwei Liu, Tao Xiang, and Antoine Toisoul}},
  title         = {{Move Anything with Layered Scene Diffusion}},
  year          = {2024},
  eprint        = {2404.07178},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{ohanyan-etal-2024-zeropainter,
      title={{Zero-Painter: Training-Free Layout Control for Text-to-Image Synthesis}},
      author={Marianna Ohanyan and Hayk Manukyan and Zhangyang Wang and Shant Navasardyan and Humphrey Shi},
      year={2024},
      eprint={2406.04032},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{parihar-etal-2024-precisecontrol,
  author        = {Rishubh Parihar and Sachidanand VS and Sabariswaran Mani and Tejan Karmali and R. Venkatesh Babu},
  title         = {{PreciseControl: Enhancing Text-To-Image Diffusion Models with Fine-Grained Attribute Control}},
  year          = {2024},
  eprint        = {2408.05083},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@article{zhou-etal-2024-migc,
  author  = {Dewei Zhou and You Li and Fan Ma and Zongting Zhang and Yi Yang},
  title   = {{{MIGC:} Multi-Instance Generation Controller for Text-to-Image Synthesis}},
  journal = {CoRR},
  year    = {2024},
  volume  = {abs/2402.05408},
}

@misc{peng-etal-2024-controlnext,
  author        = {Bohao Peng and Jian Wang and Yuechen Zhang and Wenbo Li and Ming-Chang Yang and Jiaya Jia},
  title         = {{ControlNeXt: Powerful and Efficient Control for Image and Video Generation}},
  year          = {2024},
  eprint        = {2408.06070},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{he-etal-2024-uniportrait,
  author        = {Junjie He and Yifeng Geng and Liefeng Bo},
  title         = {{UniPortrait: A Unified Framework for Identity-Preserving Single- and Multi-Human Image Personalization}},
  year          = {2024},
  eprint        = {2408.05939},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
% NOTE(review): the key says 2024 but `year` is 2023 (the arXiv 2312 date) -- confirm which is intended.
@article{mo-etal-2024-freecontrol,
  author  = {Sicheng Mo and Fangzhou Mu and Kuan Heng Lin and Yanli Liu and Bochen Guan and Yin Li and Bolei Zhou},
  title   = {{FreeControl: Training-Free Spatial Control of Any Text-to-Image Diffusion Model with Any Condition}},
  journal = {CoRR},
  year    = {2023},
  volume  = {abs/2312.07536},
}

% arXiv preprint: stored as @misc with eprint fields (an @article would require a `journal`).
@misc{liu-etal-2024-unziplora,
      title={{UnZipLoRA: Separating Content and Style from a Single Image}},
      author={Chang Liu and Viraj Shah and Aiyu Cui and Svetlana Lazebnik},
      year={2024},
      eprint={2412.04465},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
}

% arXiv preprint: stored as @misc with eprint fields (an @article would require a `journal`).
@misc{xu-etal-2024-ctrllora,
      title={{CtrLoRA: An Extensible and Efficient Framework for Controllable Image Generation}},
      author={Yifeng Xu and Zhenliang He and Shiguang Shan and Xilin Chen},
      year={2024},
      eprint={2410.09400},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
}

@misc{chen-etal-2024-regionaware,
  author        = {Zhennan Chen and Yajie Li and Haofan Wang and Zhibo Chen and Zhengkai Jiang and Jun Li and Qian Wang and Jian Yang and Ying Tai},
  title         = {{Region-Aware Text-to-Image Generation via Hard Binding and Soft Refinement}},
  year          = {2024},
  eprint        = {2411.06558},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% ----------------PERSONALIZED TEXT-TO-IMAGE GENERATION---------------------
@inproceedings{kumari-etal-2023-custom-diffusion,
  author    = {{Nupur Kumari, Bingliang Zhang, Richard Zhang, Eli Shechtman, and Jun{-}Yan Zhu}},
  title     = {{Multi-Concept Customization of Text-to-Image Diffusion}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {1931--1941},
}

@inproceedings{ruiz-etal-2023-dreambooth,
  author    = {{Nataniel Ruiz, Yuanzhen Li, Varun Jampani, Yael Pritch, Michael Rubinstein, and Kfir Aberman}},
  title     = {{DreamBooth: Fine Tuning Text-to-Image Diffusion Models for Subject-Driven Generation}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {22500--22510},
}

@inproceedings{wei-etal-2023-elite,
  author    = {{Yuxiang Wei, Yabo Zhang, Zhilong Ji, Jinfeng Bai, Lei Zhang, and Wangmeng Zuo}},
  title     = {{{ELITE:} Encoding Visual Concepts into Textual Embeddings for Customized Text-to-Image Generation}},
  booktitle = {ICCV},
  year      = {2023},
  pages     = {15897--15907},
}

@inproceedings{gal-etal-2023-textual-inversion,
  author    = {{Rinon Gal, Yuval Alaluf, Yuval Atzmon, Or Patashnik, Amit Haim Bermano, Gal Chechik, and Daniel Cohen{-}Or}},
  title     = {{An Image is Worth One Word: Personalizing Text-to-Image Generation using Textual Inversion}},
  booktitle = {ICLR},
  year      = {2023},
}

@inproceedings{avrahami-etal-2023-break-a-scene,
  author    = {{Omri Avrahami, Kfir Aberman, Ohad Fried, Daniel Cohen{-}Or, and Dani Lischinski}},
  title     = {{Break-A-Scene: Extracting Multiple Concepts from a Single Image}},
  booktitle = {SIGGRAPH},
  year      = {2023},
  pages     = {96:1--96:12},
}

@article{shi-etal-2023-instantbooth,
  author  = {{Jing Shi, Wei Xiong, Zhe Lin, and Hyun Joon Jung}},
  title   = {{InstantBooth: Personalized Text-to-Image Generation without Test-Time Finetuning}},
  journal = {CoRR},
  year    = {2023},
  volume  = {abs/2304.03411},
}

@article{gal-etal-2023-encoder,
  author  = {{Rinon Gal, Moab Arar, Yuval Atzmon, Amit H. Bermano, Gal Chechik, and Daniel Cohen{-}Or}},
  title   = {{Encoder-based Domain Tuning for Fast Personalization of Text-to-Image Models}},
  journal = {{ACM} Trans. Graph.},
  year    = {2023},
  volume  = {42},
  number  = {4},
  pages   = {150:1--150:13},
}

@misc{li-etal-2023-photomaker,
  author        = {{Zhen Li, Mingdeng Cao, Xintao Wang, Zhongang Qi, Ming-Ming Cheng, and Ying Shan}},
  title         = {{PhotoMaker: Customizing Realistic Human Photos via Stacked ID Embedding}},
  year          = {2023},
  eprint        = {2312.04461},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% Author list is a literal (double-braced) string, written "First Last" like the other literal author lists in this file.
@inproceedings{li-etal-2023-layerdiffusion,
author = {{Pengzhi Li, Qinxuan Huang, Yikang Ding, and Zhiheng Li}},
title = {{LayerDiffusion: Layered Controlled Image Editing with Diffusion Models}},
year = {2023},
booktitle = {SIGGRAPH},
}

@misc{xiao-etal-2023-fastcomposer,
  author        = {{Guangxuan Xiao, Tianwei Yin, William T. Freeman, Frédo Durand, and Song Han}},
  title         = {{FastComposer: Tuning-Free Multi-Subject Image Generation with Localized Attention}},
  year          = {2023},
  eprint        = {2305.10431},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{pang-etal-2023-cross,
  author        = {{Lianyu Pang, Jian Yin, Haoran Xie, Qiping Wang, Qing Li, and Xudong Mao}},
  title         = {{Cross Initialization for Personalized Text-to-Image Generation}},
  year          = {2023},
  eprint        = {2312.15905},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{hertz-etal-2024-style,
  author        = {{Amir Hertz, Andrey Voynov, Shlomi Fruchter, and Daniel Cohen-Or}},
  title         = {{Style Aligned Image Generation via Shared Attention}},
  year          = {2024},
  eprint        = {2312.02133},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{wang-etal-2024-highfidelity,
  author        = {{Yibin Wang, Weizhong Zhang, Jianwei Zheng, and Cheng Jin}},
  title         = {{High-fidelity Person-centric Subject-to-Image Synthesis}},
  year          = {2024},
  eprint        = {2311.10329},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{cai-etal-2024-decoupled,
  author    = {{Yufei Cai, Yuxiang Wei, Zhilong Ji, Jinfeng Bai, Hu Han, and Wangmeng Zuo}},
  title     = {{Decoupled Textual Embeddings for Customized Image Generation}},
  booktitle = {AAAI},
  year      = {2024},
  pages     = {909--917},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{huang-etal-2024-realcustom,
  author        = {{Mengqi Huang, Zhendong Mao, Mingcong Liu, Qian He, and Yongdong Zhang}},
  title         = {{RealCustom: Narrowing Real Text Word for Real-Time Open-Domain Text-to-Image Customization}},
  year          = {2024},
  eprint        = {2403.00483},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{wang-etal-2024-instantstyle,
  author        = {{Haofan Wang, Qixun Wang, Xu Bai, Zekui Qin, and Anthony Chen}},
  title         = {{InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image Generation}},
  year          = {2024},
  eprint        = {2404.02733},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{zhang-etal-2024-flashface,
  author        = {{Shilong Zhang, Lianghua Huang, Xi Chen, Yifei Zhang, Zhi-Fan Wu, Yutong Feng, Wei Wang, Yujun Shen, Yu Liu, and Ping Luo}},
  title         = {{FlashFace: Human Image Personalization with High-fidelity Identity Preservation}},
  year          = {2024},
  eprint        = {2403.17008},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{song-etal-2024-moma,
  author        = {{Kunpeng Song, Yizhe Zhu, Bingchen Liu, Qing Yan, Ahmed Elgammal, and Xiao Yang}},
  title         = {{MoMA: Multimodal LLM Adapter for Fast Personalized Image Generation}},
  year          = {2024},
  eprint        = {2404.05674},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{cui-etal-2024-idadapter,
  author        = {{Siying Cui, Jia Guo, Xiang An, Jiankang Deng, Yongle Zhao, Xinyu Wei, and Ziyong Feng}},
  title         = {{IDAdapter: Learning Mixed Features for Tuning-Free Personalization of Text-to-Image Models}},
  year          = {2024},
  eprint        = {2403.13535},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{zhang-etal-2024-disendiff,
  author        = {{Yanbing Zhang, Mengping Yang, Qin Zhou, and Zhe Wang}},
  title         = {{Attention Calibration for Disentangled Text-to-Image Personalization}},
  year          = {2024},
  eprint        = {2403.18551},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{ham-etal-2024-personalized,
  author        = {Cusuh Ham and Matthew Fisher and James Hays and Nicholas Kolkin and Yuchen Liu and Richard Zhang and Tobias Hinz},
  title         = {{Personalized Residuals for Concept-Driven Text-to-Image Generation}},
  year          = {2024},
  eprint        = {2405.12978},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{chan-etal-2024-improving,
  author        = {Kelvin C. K. Chan and Yang Zhao and Xuhui Jia and Ming-Hsuan Yang and Huisheng Wang},
  title         = {{Improving Subject-Driven Image Synthesis with Subject-Agnostic Guidance}},
  year          = {2024},
  eprint        = {2405.01356},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% TODO: Missing reference for the paper `JeDi: Joint-Image Diffusion Models for Finetuning-Free Personalized Text-to-Image Generation'

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{dahary-etal-2024-beyourself,
  author        = {Omer Dahary and Or Patashnik and Kfir Aberman and Daniel Cohen-Or},
  title         = {{Be Yourself: Bounded Attention for Multi-Subject Text-to-Image Generation}},
  year          = {2024},
  eprint        = {2403.16990},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{wei-etal-2024-powerful,
  author        = {Fanyue Wei and Wei Zeng and Zhenyang Li and Dawei Yin and Lixin Duan and Wen Li},
  title         = {{Powerful and Flexible: Personalized Text-to-Image Generation via Reinforcement Learning}},
  year          = {2024},
  eprint        = {2407.06642},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{liu-etal-2024-countering,
    author    = {Liu, Hanwen and Sun, Zhicheng and Mu, Yadong},
    title     = {{Countering Personalized Text-to-Image Generation with Influence Watermarks}},
    booktitle = {CVPR},
    month     = jun,
    year      = {2024},
    pages     = {12257--12267},
}

@misc{wu-etal-2024-core,
  author        = {Feize Wu and Yun Pang and Junyi Zhang and Lianyu Pang and Jian Yin and Baoquan Zhao and Qing Li and Xudong Mao},
  title         = {{CoRe: Context-Regularized Text Embedding Learning for Text-to-Image Personalization}},
  year          = {2024},
  eprint        = {2408.15914},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% TODO: Missing reference of the paper `Imagine Yourself: Tuning-Free Personalized Image Generation'

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{li-etal-2024-tigc,
  author        = {Pengzhi Li and Qiang Nie and Ying Chen and Xi Jiang and Kai Wu and Yuhuan Lin and Yong Liu and Jinlong Peng and Chengjie Wang and Feng Zheng},
  title         = {{Tuning-Free Image Customization with Image and Text Guidance}},
  year          = {2024},
  eprint        = {2403.12658},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@article{sun-etal-2024-anycontrol,
  author  = {Yanan Sun and Yanchen Liu and Yinhao Tang and Wenjie Pei and Kai Chen},
  title   = {{AnyControl: Create Your Artwork with Versatile Control on Text-to-Image Generation}},
  journal = {CoRR},
  year    = {2024},
  volume  = {abs/2406.18958},
}

% arXiv preprint, recorded as CoRR + abs/<id> like the other preprint @article entries in this file.
% The author list is a literal (double-braced) string; single braces with commas cannot be parsed as names.
@article{tan-etal-2024-ominicontrol,
  author  = {{Zhenxiong Tan, Songhua Liu, Xingyi Yang, Qiaochu Xue, and Xinchao Wang}},
  title   = {{OminiControl: Minimal and Universal Control for Diffusion Transformer}},
  journal = {CoRR},
  volume  = {abs/2411.15098},
  year    = {2024},
}

% ----------------TEXT-GUIDED IMAGE EDITING---------------------

@inproceedings{cao-etal-2023-masactrl,
  author    = {{Mingdeng Cao, Xintao Wang, Zhongang Qi, Ying Shan, Xiaohu Qie, and Yinqiang Zheng}},
  title     = {{MasaCtrl: Tuning-Free Mutual Self-Attention Control for Consistent Image Synthesis and Editing}},
  booktitle = {ICCV},
  year      = {2023},
  pages     = {22503--22513},
}

@inproceedings{patashnik-etal-2023-localizing,
  author    = {{Or Patashnik, Daniel Garibi, Idan Azuri, Hadar Averbuch{-}Elor, and Daniel Cohen{-}Or}},
  title     = {{Localizing Object-level Shape Variations with Text-to-Image Diffusion Models}},
  booktitle = {ICCV},
  year      = {2023},
  pages     = {22994--23004},
}

@inproceedings{wu-etal-2023-uncovering,
  author    = {{Qiucheng Wu, Yujian Liu, Handong Zhao, Ajinkya Kale, Trung Bui, Tong Yu, Zhe Lin, Yang Zhang, and Shiyu Chang}},
  title     = {{Uncovering the Disentanglement Capability in Text-to-Image Diffusion Models}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {1900--1910},
}

@inproceedings{zhang-etal-2023-sine,
  author    = {{Zhixing Zhang, Ligong Han, Arnab Ghosh, Dimitris N. Metaxas, and Jian Ren}},
  title     = {{{SINE:} SINgle Image Editing with Text-to-Image Diffusion Models}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {6027--6037},
}

@inproceedings{kawar-etal-2023-imagic,
  author    = {{Bahjat Kawar, Shiran Zada, Oran Lang, Omer Tov, Huiwen Chang, Tali Dekel, Inbar Mosseri, and Michal Irani}},
  title     = {{Imagic: Text-Based Real Image Editing with Diffusion Models}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {6007--6017},
}

@inproceedings{brooks-etal-2023-instructpix2pix,
  author       = {{Tim Brooks,
                  Aleksander Holynski, and
                  Alexei A. Efros}},
  title        = {{InstructPix2Pix: Learning to Follow Image Editing Instructions}},
  booktitle    = {CVPR},
  pages        = {18392--18402},
  year         = {2023},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{xu-etal-2023-infedit,
  author        = {{Sihan Xu, Yidong Huang, Jiayi Pan, Ziqiao Ma, and Joyce Chai}},
  title         = {{Inversion-Free Image Editing with Natural Language}},
  year          = {2023},
  eprint        = {2312.04965},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{liu-etal-2024-understanding,
  author        = {{Bingyan Liu, Chengyu Wang, Tingfeng Cao, Kui Jia, and Jun Huang}},
  title         = {{Towards Understanding Cross and Self-Attention in Stable Diffusion for Text-Guided Image Editing}},
  year          = {2024},
  eprint        = {2403.03431},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{song-etal-2024-doubly,
  author        = {{Xue Song, Jiequan Cui, Hanwang Zhang, Jingjing Chen, Richang Hong, and Yu-Gang Jiang}},
  title         = {{Doubly Abductive Counterfactual Inference for Text-based Image Editing}},
  year          = {2024},
  eprint        = {2403.02981},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{guo-etal-2023-focus,
  author        = {{Qin Guo and Tianwei Lin}},
  title         = {{Focus on Your Instruction: Fine-grained and Multi-instruction Image Editing by Attention Modulation}},
  year          = {2023},
  eprint        = {2312.10113},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{nam-etal-2023-contrastive,
  author        = {{Hyelin Nam, Gihyun Kwon, Geon Yeong Park, and Jong Chul Ye}},
  title         = {{Contrastive Denoising Score for Text-guided Latent Diffusion Image Editing}},
  year          = {2023},
  eprint        = {2311.18608},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{shi-etal-2023-dragdiffusion,
  author        = {{Yujun Shi, Chuhui Xue, Jun Hao Liew, Jiachun Pan, Hanshu Yan, Wenqing Zhang, Vincent Y. F. Tan, and Song Bai}},
  title         = {{DragDiffusion: Harnessing Diffusion Models for Interactive Point-based Image Editing}},
  year          = {2023},
  eprint        = {2306.14435},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{mou-etal-2024-diffeditor,
  author        = {{Chong Mou, Xintao Wang, Jiechong Song, Ying Shan, and Jian Zhang}},
  title         = {{DiffEditor: Boosting Accuracy and Flexibility on Diffusion-based Image Editing}},
  year          = {2024},
  eprint        = {2402.02583},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{ling-etal-2023-freedrag,
  author        = {{Pengyang Ling, Lin Chen, Pan Zhang, Huaian Chen, Yi Jin, and Jinjin Zheng}},
  title         = {{FreeDrag: Feature Dragging for Reliable Point-based Image Editing}},
  year          = {2023},
  eprint        = {2307.04684},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{lin-etal-2023-textdriven,
  author        = {{Yuanze Lin, Yi-Wen Chen, Yi-Hsuan Tsai, Lu Jiang, and Ming-Hsuan Yang}},
  title         = {{Text-Driven Image Editing via Learnable Regions}},
  year          = {2023},
  eprint        = {2311.16432},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
% NOTE(review): this key lacks the `-etal-` segment used by the other keys; left unchanged so any existing citations of it still resolve.
@misc{brack-2023-leditsplusplus,
  author        = {{Manuel Brack, Felix Friedrich, Katharina Kornmeier, Linoy Tsaban, Patrick Schramowski, Kristian Kersting, and Apolinário Passos}},
  title         = {{LEDITS++: Limitless Image Editing using Text-to-Image Models}},
  year          = {2023},
  eprint        = {2311.16711},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{huang-etal-2023-smartedit,
  author        = {{Yuzhou Huang, Liangbin Xie, Xintao Wang, Ziyang Yuan, Xiaodong Cun, Yixiao Ge, Jiantao Zhou, Chao Dong, Rui Huang, Ruimao Zhang, and Ying Shan}},
  title         = {{SmartEdit: Exploring Complex Instruction-based Image Editing with Multimodal Large Language Models}},
  year          = {2023},
  eprint        = {2312.06739},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{nguyen-etal-2024-edit,
  author        = {{Thao Nguyen, Utkarsh Ojha, Yuheng Li, Haotian Liu, and Yong Jae Lee}},
  title         = {{Edit One for All: Interactive Batch Image Editing}},
  year          = {2024},
  eprint        = {2401.10219},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE(review): the key contains `=` where the other keys use `-` (probably a typo); left unchanged so any existing citations of it still resolve.
@inproceedings{fu-etal-2024=guiding,
  author    = {{Tsu-Jui Fu, Wenze Hu, Xianzhi Du, William Yang Wang, Yinfei Yang, and Zhe Gan}},
  title     = {{Guiding Instruction-based Image Editing via Multimodal Large Language Models}},
  booktitle = {ICLR},
  year      = {2024},
  pages     = {1--24},
}

% NOTE: This reference will be replaced with ICLR version as soon as the metadata is updated
@misc{nie-etal-2024-blessing,
  author        = {{Shen Nie, Hanzhong Allan Guo, Cheng Lu, Yuhao Zhou, Chenyu Zheng, and Chongxuan Li}},
  title         = {{The Blessing of Randomness: SDE Beats ODE in General Diffusion-based Image Editing}},
  year          = {2024},
  eprint        = {2311.01410},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{geng-etal-2024-motionguidance,
  author    = {{Daniel Geng and Andrew Owens}},
  title     = {{Motion Guidance: Diffusion-Based Image Editing with Differentiable Motion Estimators}},
  booktitle = {ICLR},
  year      = {2024},
}

@inproceedings{yang-etal-2024-objectaware,
  author    = {{Zhen Yang, Ganggui Ding, Wen Wang, Hao Chen, Bohan Zhuang, and Chunhua Shen}},
  title     = {{Object-Aware Inversion and Reassembly for Image Editing}},
  booktitle = {ICLR},
  year      = {2024},
  pages     = {1--20},
}

@inproceedings{cho-etal-2024-noisemapguidance,
  author    = {{Hansam Cho, Jonghyun Lee, Seoung Bum Kim, Tae-Hyun Oh, and Yonghyun Jeong}},
  title     = {{Noise Map Guidance: Inversion with Spatial Context for Real Image Editing}},
  booktitle = {ICLR},
  year      = {2024},
  pages     = {1--20},
}

@inproceedings{duan-etal-2024-tuningfree,
  author    = {{Xiaoyue Duan, Shuhao Cui, Guoliang Kang, Baochang Zhang, Zhengcong Fei, Mingyuan Fan, and Junshi Huang}},
  title     = {{Tuning-Free Inversion-Enhanced Control for Consistent Image Editing}},
  booktitle = {AAAI},
  year      = {2024},
  pages     = {1644--1652},
}

@inproceedings{qiao-etal-2024-baret,
  author    = {{Yuming Qiao, Fanyi Wang, Jingwen Su, Yanhao Zhang, Yunjie Yu, Siyu Wu, and Guo{-}Jun Qi}},
  title     = {{{BARET:} Balanced Attention Based Real Image Editing Driven by Target-Text Inversion}},
  booktitle = {AAAI},
  year      = {2024},
  pages     = {4560--4568},
}

@inproceedings{yu-etal-2024-accelerating,
  author    = {{Zihao Yu, Haoyang Li, Fangcheng Fu, Xupeng Miao, and Bin Cui}},
  title     = {{Accelerating Text-to-Image Editing via Cache-Enabled Sparse Diffusion Inference}},
  booktitle = {AAAI},
  year      = {2024},
  pages     = {16605--16613},
}

@inproceedings{hou-etal-2024-highfidelity,
  author    = {{Chen Hou, Guoqiang Wei, and Zhibo Chen}},
  title     = {{High-Fidelity Diffusion-Based Image Editing}},
  booktitle = {AAAI},
  year      = {2024},
  pages     = {2184--2192},
}

@inproceedings{ma-etal-2024-adapedit,
  author    = {{Zhiyuan Ma, Guoli Jia, and Bowen Zhou}},
  title     = {{AdapEdit: Spatio-Temporal Guided Adaptive Editing Algorithm for Text-Based Continuity-Sensitive Image Editing}},
  booktitle = {AAAI},
  year      = {2024},
  pages     = {4154--4161},
}

@misc{feng-etal-2024-item,
      title={{An Item is Worth a Prompt: Versatile Image Editing with Disentangled Control}},
      author={{Aosong Feng, Weikang Qiu, Jinbin Bai, Kaicheng Zhou, Zhen Dong, Xiao Zhang, Rex Ying, and Leandros Tassiulas}},
      year={2024},
      eprint={2403.04880},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{cui-etal-2024-stabledrag,
  author        = {{Yutao Cui, Xiaotong Zhao, Guozhen Zhang, Shengming Cao, Kai Ma, and Limin Wang}},
  title         = {{StableDrag: Stable Dragging for Point-based Image Editing}},
  year          = {2024},
  eprint        = {2403.04437},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE(review): the key reads `etal2024` without the `-` separator used by the other keys; left unchanged so any existing citations of it still resolve.
@misc{lyu-etal2024-onedimensional,
  author        = {{Mengyao Lyu, Yuhong Yang, Haiwen Hong, Hui Chen, Xuan Jin, Yuan He, Hui Xue, Jungong Han, and Guiguang Ding}},
  title         = {{One-Dimensional Adapter to Rule Them All: Concepts, Diffusion Models and Erasing Applications}},
  year          = {2024},
  eprint        = {2312.16145},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{wang-etal-2024-texfit,
  author    = {{Tongxin Wang and Mang Ye}},
  title     = {{TexFit: Text-Driven Fashion Image Editing with Diffusion Models}},
  booktitle = {AAAI},
  year      = {2024},
  pages     = {10198--10206},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{zhang-etal-2023-diffmorpher,
  author        = {{Kaiwen Zhang, Yifan Zhou, Xudong Xu, Xingang Pan, and Bo Dai}},
  title         = {{DiffMorpher: Unleashing the Capability of Diffusion Models for Image Morphing}},
  year          = {2023},
  eprint        = {2312.07409},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{yang-etal-2024-editworld,
  author        = {{Ling Yang, Bohan Zeng, Jiaming Liu, Hong Li, Minghao Xu, Wentao Zhang, and Shuicheng Yan}},
  title         = {{EditWorld: Simulating World Dynamics for Instruction-Following Image Editing}},
  year          = {2024},
  eprint        = {2405.14785},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{jin-etal-2024-reasonpix2pix,
  author        = {{Ying Jin, Pengyang Ling, Xiaoyi Dong, Pan Zhang, Jiaqi Wang, and Dahua Lin}},
  title         = {{ReasonPix2Pix: Instruction Reasoning Dataset for Advanced Image Editing}},
  year          = {2024},
  eprint        = {2405.11190},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{chen-etal-2024-tinoedit,
  author        = {Sherry X. Chen and Yaron Vaxman and Elad Ben Baruch and David Asulin and Aviad Moreshet and Kuo-Chin Lien and Misha Sra and Pradeep Sen},
  title         = {{TiNO-Edit: Timestep and Noise Optimization for Robust Diffusion-Based Image Editing}},
  year          = {2024},
  eprint        = {2404.11120},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% TODO: Missing reference of the paper `Person in Place: Generating Associative Skeleton-Guidance Maps for Human-Object Interaction Image Editing'

% TODO: Missing reference of the paper `Referring Image Editing: Object-level Image Editing via Referring Expressions'

% TODO: Missing reference of the paper `Prompt Augmentation for Self-supervised Text-guided Image Manipulation'

@inproceedings{bobkov-etal-2024-devil,
    author    = {Bobkov, Denis and Titov, Vadim and Alanov, Aibek and Vetrov, Dmitry},
    title     = {{The Devil is in the Details: StyleFeatureEditor for Detail-Rich StyleGAN Inversion and High Quality Image Editing}},
    booktitle = {CVPR},
    month     = jun,
    year      = {2024},
    pages     = {9337--9346},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{lu-etal-2024-regiondrag,
  author        = {Jingyi Lu and Xinghui Li and Kai Han},
  title         = {{RegionDrag: Fast Region-Based Image Editing with Diffusion Models}},
  year          = {2024},
  eprint        = {2407.18247},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{meng-etal-2024-instructgie,
  author        = {Zichong Meng and Changdi Yang and Jun Liu and
                   Hao Tang and Pu Zhao and Yanzhi Wang},
  title         = {{InstructGIE: Towards Generalizable Image Editing}},
  year          = {2024},
  eprint        = {2403.05018},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{wu-etal-2024-turboedit,
  author        = {Zongze Wu and Nicholas Kolkin and Jonathan Brandt and
                   Richard Zhang and Eli Shechtman},
  title         = {{TurboEdit: Instant Text-based Image Editing}},
  year          = {2024},
  eprint        = {2408.08332},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@article{kang-etal-2024-eta-inversion,
  author  = {Wonjun Kang and Kevin Galim and Hyung Il Koo},
  title   = {{Eta Inversion: Designing an Optimal Eta Function for Diffusion-based Real Image Editing}},
  journal = {CoRR},
  year    = {2024},
  volume  = {abs/2403.09468},
}

% TODO: Missing reference of the paper `SwapAnything: Enabling Arbitrary Object Swapping in Personalized Image Editing'

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@article{titov-etal-2024-guide-and-rescale,
  author  = {Vadim Titov and Madina Khalmatova and Alexandra Ivanova and
             Dmitry Vetrov and Aibek Alanov},
  title   = {{Guide-and-Rescale: Self-Guidance Mechanism for Effective Tuning-Free Real Image Editing}},
  journal = {CoRR},
  year    = {2024},
  volume  = {abs/2409.01322},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@article{wu-etal-2024-freediff,
  author  = {Wei Wu and Qingnan Fan and Shuai Qin and
             Hong Gu and Ruoyu Zhao and Antoni B. Chan},
  title   = {{FreeDiff: Progressive Frequency Truncation for Image Editing with Diffusion Models}},
  journal = {CoRR},
  year    = {2024},
  volume  = {abs/2404.11895},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@article{nitzan-etal-2024-lazy,
  author  = {Yotam Nitzan and Zongze Wu and Richard Zhang and Eli Shechtman and
             Daniel Cohen{-}Or and Taesung Park and Micha{\"{e}}l Gharbi},
  title   = {{Lazy Diffusion Transformer for Interactive Image Editing}},
  journal = {CoRR},
  year    = {2024},
  volume  = {abs/2404.12382},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@article{ren-etal-2024-byteedit,
  author  = {Yuxi Ren and Jie Wu and Yanzuo Lu and Huafeng Kuang and Xin Xia and
             Xionghui Wang and Qianqian Wang and Yixing Zhu and Pan Xie and
             Shiyin Wang and Xuefeng Xiao and Yitong Wang and Min Zheng and Lean Fu},
  title   = {{ByteEdit: Boost, Comply and Accelerate Generative Image Editing}},
  journal = {CoRR},
  year    = {2024},
  volume  = {abs/2404.04860},
}

@misc{kulikov-etal-2024-flowedit,
  author        = {Vladimir Kulikov and Matan Kleiner and
                   Inbar Huberman-Spiegelglas and Tomer Michaeli},
  title         = {{FlowEdit: Inversion-Free Text-Based Editing Using Pre-Trained Flow Models}},
  year          = {2024},
  eprint        = {2412.08629},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% ----------------TEXT IMAGE GENERATION---------------------

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{tuo-etal-2024-anytext,
  author        = {{Yuxiang Tuo, Wangmeng Xiang, Jun-Yan He, Yifeng Geng, and Xuansong Xie}},
  title         = {{AnyText: Multilingual Visual Text Generation And Editing}},
  year          = {2024},
  eprint        = {2311.03054},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
% SceneTextGen: scene-text image synthesis with diffusion models (arXiv preprint).
% NOTE(review): primaryClass was missing; cs.CV matches the arXiv listing as far as known -- confirm.
@misc{zhangli-etal-2024-scenetextgen,
  author        = {Qilong Zhangli and Jindong Jiang and Di Liu and Licheng Yu and
                   Xiaoliang Dai and Ankit Ramchandani and Guan Pang and
                   Dimitris N. Metaxas and Praveen Krishnan},
  title         = {{SceneTextGen: Layout-Agnostic Scene Text Image Synthesis with Diffusion Models}},
  year          = {2024},
  eprint        = {2406.01062},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% ----------------DATASETS---------------------

@inproceedings{lin-etal-2014-mscoco,
  author    = {{Tsung{-}Yi Lin, Michael Maire, Serge J. Belongie, James Hays,
                Pietro Perona, Deva Ramanan, Piotr Doll{\'{a}}r, and C. Lawrence Zitnick}},
  title     = {{Microsoft {COCO:} Common Objects in Context}},
  booktitle = {ECCV},
  year      = {2014},
  volume    = {8693},
  pages     = {740--755},
}

@inproceedings{sharma-etal-2018-conceptual-captions,
  author    = {{Piyush Sharma, Nan Ding, Sebastian Goodman, and Radu Soricut}},
  title     = {{Conceptual Captions: {A} Cleaned, Hypernymed, Image Alt-text Dataset For Automatic Image Captioning}},
  booktitle = {ACL},
  year      = {2018},
  pages     = {2556--2565},
}

@inproceedings{schuhmann-etal-2022-laion,
  author    = {{Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon,
                Ross Wightman, Mehdi Cherti, Theo Coombes, Aarush Katta,
                Clayton Mullis, Mitchell Wortsman, Patrick Schramowski, Srivatsa Kundurthy,
                Katherine Crowson, Ludwig Schmidt, Robert Kaczmarczyk, and Jenia Jitsev}},
  title     = {{{LAION-5B:} An Open Large-Scale Dataset for Training Next Generation Image-Text Models}},
  booktitle = {NeurIPS},
  year      = {2022},
}

% ----------------PREREQUISITES---------------------
@inproceedings{ho-etal-2020-ddpm,
  author    = {{Jonathan Ho, Ajay Jain, and Pieter Abbeel}},
  title     = {{Denoising Diffusion Probabilistic Models}},
  booktitle = {NeurIPS},
  year      = {2020},
}

@article{raffel-etal-2020-t5,
  author  = {{Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang,
              Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu}},
  title   = {{Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer}},
  journal = {J. Mach. Learn. Res.},
  year    = {2020},
  volume  = {21},
  pages   = {140:1--140:67},
}

@inproceedings{song-etal-2021-ddim,
  author    = {{Jiaming Song, Chenlin Meng, and Stefano Ermon}},
  title     = {{Denoising Diffusion Implicit Models}},
  booktitle = {ICLR},
  year      = {2021},
}

@inproceedings{dhariwal-etal-2021-classifier-guidance,
  author    = {{Prafulla Dhariwal and Alexander Quinn Nichol}},
  title     = {{Diffusion Models Beat GANs on Image Synthesis}},
  booktitle = {NeurIPS},
  year      = {2021},
  pages     = {8780--8794},
}

@inproceedings{radford-etal-2021-clip,
  author    = {{Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh,
                Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell,
                Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever}},
  title     = {{Learning Transferable Visual Models From Natural Language Supervision}},
  booktitle = {ICML},
  year      = {2021},
  volume    = {139},
  pages     = {8748--8763},
}

% Classifier-free guidance; cited as a CoRR (arXiv) preprint.
% NOTE(review): volume added so the preprint is identifiable like the file's other CoRR entries -- confirm the arXiv id.
@article{ho-etal-2022-classifier-free-guidance,
  author  = {{Jonathan Ho and Tim Salimans}},
  title   = {{Classifier-Free Diffusion Guidance}},
  journal = {CoRR},
  volume  = {abs/2207.12598},
  year    = {2022},
}


% ----------------DIFFUSION MODELS MEET LLMS---------------------
% SUR-Adapter, ELLA, ParaDiffusion can be found using "Ctrl+F"/"Command+F"

@misc{zheng-etal-2024-minigpt5,
  author        = {{Kaizhi Zheng, Xuehai He, and Xin Eric Wang}},
  title         = {{MiniGPT-5: Interleaved Vision-and-Language Generation via Generative Vokens}},
  year          = {2024},
  eprint        = {2310.02239},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% ----------------DIFFUSION MODELS MEET MAMBA---------------------

@misc{gu-etal-2023-mamba,
  author        = {{Albert Gu and Tri Dao}},
  title         = {{Mamba: Linear-Time Sequence Modeling with Selective State Spaces}},
  year          = {2023},
  eprint        = {2312.00752},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}

% DiS: diffusion models with a state-space backbone (arXiv preprint).
% Title is double-braced like the rest of the file so styles do not re-case it.
@misc{fei-etal-2024-dis,
  author        = {{Zhengcong Fei, Mingyuan Fan, Changqian Yu, and Junshi Huang}},
  title         = {{Scalable Diffusion Models with State Space Backbone}},
  year          = {2024},
  eprint        = {2402.05608},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% ZigMa: zigzag Mamba diffusion model (arXiv preprint).
% The last author's umlaut is written as a LaTeX escape so it survives 8-bit BibTeX.
@misc{hu-etal-2024-zigma,
  author        = {{Vincent Tao Hu, Stefan Andreas Baumann, Ming Gui, Olga Grebenkova, Pingchuan Ma, Johannes Fischer, and Bj{\"o}rn Ommer}},
  title         = {{ZigMa: Zigzag Mamba Diffusion Model}},
  year          = {2024},
  eprint        = {2403.13802},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{teng-etal-2024-dim,
  author        = {Yao Teng and Yue Wu and Han Shi and Xuefei Ning and
                   Guohao Dai and Yu Wang and Zhenguo Li and Xihui Liu},
  title         = {{DiM: Diffusion Mamba for Efficient High-Resolution Image Synthesis}},
  year          = {2024},
  eprint        = {2405.14224},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% ----------------DIFFUSION MODELS MEET FEDERATED LEARNING---------------------
@misc{jothiraj-etal-2023-phoenix,
  author        = {{Fiona Victoria Stanley Jothiraj and Afra Mashhadi}},
  title         = {{Phoenix: A Federated Generative Diffusion Model}},
  year          = {2023},
  eprint        = {2306.04098},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}

% NOTE: This reference will be replaced with AAAI version as soon as the metadata is updated
@misc{yang-etal-2023-exploring,
  author        = {{Mingzhao Yang, Shangchao Su, Bin Li, and Xiangyang Xue}},
  title         = {{Exploring One-shot Semi-supervised Federated Learning with A Pre-trained Diffusion Model}},
  year          = {2023},
  eprint        = {2305.04063},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{li-etal-2023-feddiff,
  author        = {{DaiXun Li, Weiying Xie, ZiXuan Wang, YiBing Lu, Yunsong Li, and Leyuan Fang}},
  title         = {{FedDiff: Diffusion Model Driven Federated Learning for Multi-Modal and Multi-Clients}},
  year          = {2023},
  eprint        = {2401.02433},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{yang-etal-2023-oneshot,
  author        = {{Mingzhao Yang, Shangchao Su, Bin Li, and Xiangyang Xue}},
  title         = {{One-Shot Federated Learning with Classifier-Guided Diffusion Models}},
  year          = {2023},
  eprint        = {2311.08870},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% Federated learning with diffusion models for privacy-sensitive vision tasks (ATC 2023).
% The literal author list is given in "First Last" order like the file's other literal lists.
% NOTE(review): given/family split inferred from the original "Last, First" pairs -- confirm.
@inproceedings{tun-etal-2023-federated,
  author    = {{Ye Lin Tun, Chu Myaet Thwal, Ji Su Yoon, Sun Moo Kang, Chaoning Zhang, and Choong Seon Hong}},
  title     = {{Federated Learning with Diffusion Models for Privacy-Sensitive Vision Tasks}},
  booktitle = {ATC},
  year      = {2023},
  pages     = {305--310},
}


% ----------------DIFFUSION TRANSFORMER-BASED METHODS---------------------
@inproceedings{peebles-etal-2023-dit,
  author    = {{William Peebles and Saining Xie}},
  title     = {{Scalable Diffusion Models with Transformers}},
  booktitle = {ICCV},
  year      = {2023},
  pages     = {4172--4182},
}

@inproceedings{bao-etal-2023-unidiffusers,
  author    = {{Fan Bao, Shen Nie, Kaiwen Xue, Chongxuan Li, Shi Pu,
                Yaole Wang, Gang Yue, Yue Cao, Hang Su, and Jun Zhu}},
  title     = {{One Transformer Fits All Distributions in Multi-Modal Diffusion at Scale}},
  booktitle = {ICML},
  year      = {2023},
  pages     = {1692--1717},
}

@inproceedings{gao-etal-2023-mdt-v1,
  author    = {{Shanghua Gao, Pan Zhou, Ming{-}Ming Cheng, and Shuicheng Yan}},
  title     = {{Masked Diffusion Transformer is a Strong Image Synthesizer}},
  booktitle = {ICCV},
  year      = {2023},
  pages     = {23107--23116},
}

@misc{gao-etal-2024-mdtv2,
  author        = {{Shanghua Gao, Pan Zhou, Ming-Ming Cheng, and Shuicheng Yan}},
  title         = {{MDTv2: Masked Diffusion Transformer is a Strong Image Synthesizer}},
  year          = {2024},
  eprint        = {2303.14389},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@article{zheng-etal-2024-mask-dit,
  author  = {{Hongkai Zheng, Weili Nie, Arash Vahdat, and Anima Anandkumar}},
  title   = {{Fast Training of Diffusion Models with Masked Transformers}},
  journal = {TMLR},
  year    = {2024},
  issn    = {2835-8856},
}


% ----------------DIFFUSION MODELS FOR TEXT GENERATION---------------------

@inproceedings{austin-etal-2021-d3pm,
  author    = {{Jacob Austin, Daniel D. Johnson, Jonathan Ho, Daniel Tarlow, and Rianne van den Berg}},
  title     = {{Structured Denoising Diffusion Models in Discrete State-Spaces}},
  booktitle = {NeurIPS},
  year      = {2021},
  pages     = {17981--17993},
}

@inproceedings{li-etal-2022-diffusionlm,
  author    = {{Xiang Li, John Thickstun, Ishaan Gulrajani, Percy Liang, and Tatsunori B. Hashimoto}},
  title     = {{Diffusion-LM Improves Controllable Text Generation}},
  booktitle = {NeurIPS},
  year      = {2022},
}

@misc{zhu-etal-2022-ddcap,
  author        = {{Zixin Zhu, Yixuan Wei, Jianfeng Wang, Zhe Gan, Zheng Zhang, Le Wang, Gang Hua, Lijuan Wang, Zicheng Liu, and Han Hu}},
  title         = {{Exploring Discrete Diffusion Models for Image Captioning}},
  year          = {2022},
  eprint        = {2211.11694},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% DiffusionBERT (ACL 2023).
% Authors are separated with " and " so BibTeX parses six "Last, First" names
% rather than a single name containing too many commas.
@inproceedings{he-etal-2023-diffusionbert,
  author    = {He, Zhengfu and
               Sun, Tianxiang and
               Tang, Qiong and
               Wang, Kuanning and
               Huang, Xuanjing and
               Qiu, Xipeng},
  title     = {{{D}iffusion{BERT}: Improving Generative Masked Language Models with Diffusion Models}},
  booktitle = {ACL},
  month     = jul,
  year      = {2023},
  pages     = {4521--4534},
}

@inproceedings{chen-etal-2023-bit-diffusion,
  author    = {{Ting Chen, Ruixiang Zhang, and Geoffrey Hinton}},
  title     = {{Analog Bits: Generating Discrete Data using Diffusion Models with Self-Conditioning}},
  booktitle = {ICLR},
  year      = {2023},
  pages     = {1--23},
}

@inproceedings{luo-etal-2023-scd-net,
  author    = {{Jianjie Luo, Yehao Li, Yingwei Pan, Ting Yao,
                Jianlin Feng, Hongyang Chao, and Tao Mei}},
  title     = {{Semantic-Conditional Diffusion Networks for Image Captioning}},
  booktitle = {CVPR},
  year      = {2023},
  pages     = {23359--23368},
}

@inproceedings{gong-etal-2023-diffuseq,
  author    = {{Shansan Gong, Mukai Li, Jiangtao Feng, Zhiyong Wu, and Lingpeng Kong}},
  title     = {{DiffuSeq: Sequence to Sequence Text Generation with Diffusion Models}},
  booktitle = {ICLR},
  year      = {2023},
  pages     = {1--20},
}

@misc{gong-etal-2023-diffuseqv2,
  author        = {{Shansan Gong, Mukai Li, Jiangtao Feng, Zhiyong Wu, and Lingpeng Kong}},
  title         = {{DiffuSeq-v2: Bridging Discrete and Continuous Text Spaces for Accelerated Seq2Seq Diffusion Models}},
  year          = {2023},
  eprint        = {2310.05793},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}

@misc{he-etal-2023-diffcap,
  author        = {{Yufeng He, Zefan Cai, Xu Gan, and Baobao Chang}},
  title         = {{DiffCap: Exploring Continuous Diffusion on Image Captioning}},
  year          = {2023},
  eprint        = {2305.12144},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{li-etal-2023-glyphdiffusion,
  author        = {{Junyi Li, Wayne Xin Zhao, Jian-Yun Nie, and Ji-Rong Wen}},
  title         = {{GlyphDiffusion: Text Generation as Image Generation}},
  year          = {2023},
  eprint        = {2304.12519},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}