<!DOCTYPE html>
<!-- ACE project page. The doctype must remain the first token of the file: any text placed before it puts browsers into quirks mode and is rendered as page content. -->
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- Required for responsive layout: without it mobile browsers lay the page out at a desktop-sized virtual viewport. -->
  <meta name="viewport" content="width=device-width, initial-scale=1">

  <!-- Meta tags for social media banners; fill these in appropriately, as they are the page's "business card". -->
  <meta name="description" content="ACE: All-round Creator and Editor Following Instructions via Diffusion Transformer">
  <meta property="og:title" content="ACE">
  <meta property="og:description" content="All-round Creator and Editor Following Instructions via Diffusion Transformer">
  <!-- TODO: set og:url to the canonical URL of the deployed page. -->
  <meta property="og:url" content="">

  <!-- Optional banner image for Open Graph; optimal dimensions are 1200x630. -->
  <!-- <meta property="og:image" content="static/image/your_banner_image.png"> -->
  <!-- <meta property="og:image:width" content="1200"> -->
  <!-- <meta property="og:image:height" content="630"> -->

  <!-- Optional Twitter card metadata; optimal banner dimensions are 1200x600. -->
  <!-- <meta name="twitter:title" content="TWITTER BANNER TITLE META TAG"> -->
  <!-- <meta name="twitter:description" content="TWITTER BANNER DESCRIPTION META TAG"> -->
  <!-- <meta name="twitter:image" content="static/images/your_twitter_banner_image.png"> -->
  <!-- <meta name="twitter:card" content="summary_large_image"> -->

  <!-- Optional keywords for the paper to be indexed by. -->
  <!-- <meta name="keywords" content="KEYWORDS SHOULD BE PLACED HERE"> -->

  <title>ACE: All-round Creator and Editor Following Instructions via Diffusion Transformer</title>
  <!-- The favicon file is a PNG, so the declared type must be image/png. -->
  <link rel="icon" type="image/png" href="static/images/logo.png">

  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
  href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="static/css/index.css">

  <!-- Script order is significant: the libraries are loaded before static/js/index.js. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>
  <script src="static/js/index.js"></script>
</head>
<body>

<!-- TODO: announce the release schedule -->
  <!-- Page header: paper title, authors, affiliation, resource links and release news. -->
  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <!-- The icon stands in for the name "ACE" in the title, so its alt text must supply that word. -->
            <h1 class="title is-2 publication-title">
              <img class="title_icon" src="static/images/icon.png" alt="ACE">: All-round Creator and Editor Following Instructions via Diffusion Transformer</h1>
            <!-- Paper authors. No homepage URLs are set yet, so the anchors deliberately omit href:
                 an empty href combined with target="_blank" would reopen this page in a new tab.
                 Add href (and target) once the URLs exist. -->
            <div class="is-size-5 publication-authors">
              <span class="author-block">
                <a>Zhen Han</a><sup>*</sup>,</span>
              <span class="author-block">
                <a>Zeyinzi Jiang</a><sup>*</sup>,</span>
              <span class="author-block">
                <a>Yulin Pan</a><sup>*</sup>,</span>
              <span class="author-block">
                <a>Jingfeng Zhang</a><sup>*</sup>,</span>
              <span class="author-block">
                <a>Chaojie Mao</a><sup>*†</sup>,</span>
            </div>
            <div class="is-size-5 publication-authors">
              <span class="author-block">
                <a>Chenwei Xie</a>,</span>
              <span class="author-block">
                <a>Yu Liu</a>,</span>
              <span class="author-block">
                <a>Jingren Zhou</a></span>
            </div>

            <!-- Affiliation and contribution footnotes -->
            <div class="is-size-5 publication-authors">
              <span class="author-block">Tongyi Lab, Alibaba Group</span>
              <span class="eql-cntrb"><small><br>
                <sup>*</sup> Equal Contribution. Order is determined by random dice rolling.
                <sup>†</sup> Project leader.</small></span>
            </div>
            <div class="is-size-5 publication-authors">
              <span class="eql-cntrb"><small>
                <b>Acknowledgments:</b> Haiming Zhao, Yuntao
                Hong, You Wu, Jixuan Chen, Yuwei Wang, and Sheng Yao for their data contributions, and Lianghua
                Huang, Kai Zhu, and Yutong Feng for their discussions, suggestions, and the sharing of resources.
              </small></span>
            </div>

            <!-- Resource buttons. Icons are decorative (the visible label names each link), so they are hidden from assistive technology. -->
            <div class="column has-text-centered">
              <div class="publication-links">
                <!-- Github link -->
                <span class="link-block">
                  <a class="external-link button is-normal is-rounded is-dark"
                     href="https://github.com/ali-vilab/ACE/" target="_blank" rel="noopener">
                    <span class="icon" aria-hidden="true">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Code</span>
                  </a>
                </span>
                <!-- Scepter Github link -->
                <span class="link-block">
                  <a class="external-link button is-normal is-rounded is-dark"
                     href="https://github.com/modelscope/scepter" target="_blank" rel="noopener">
                    <span class="icon" aria-hidden="true">
                      🪄
                    </span>
                    <span>Scepter</span>
                  </a>
                </span>
                <!-- Huggingface Space link -->
                <span class="link-block">
                  <a class="external-link button is-normal is-rounded is-dark"
                     href="https://huggingface.co/spaces/scepter-studio/ACE-Chat" target="_blank" rel="noopener">
                    <span class="icon" aria-hidden="true">
                      🤗
                    </span>
                    <span>Demo</span>
                  </a>
                </span>

                <!-- ArXiv abstract Link -->
                <span class="link-block">
                  <a class="external-link button is-normal is-rounded is-dark"
                     href="https://arxiv.org/abs/2410.00086" target="_blank" rel="noopener">
                    <span class="icon" aria-hidden="true">
                      <i class="ai ai-arxiv"></i>
                    </span>
                    <span>arXiv</span>
                  </a>
                </span>
              </div>
              <!-- Both buttons below show the same visible label, so each gets an aria-label naming its host. -->
              <div class="publication-links">
                <!-- Huggingface Checkpoint link -->
                <span class="link-block">
                  <a class="external-link button is-normal is-rounded is-dark"
                     href="https://huggingface.co/scepter-studio/ACE-0.6B-512px" target="_blank" rel="noopener"
                     aria-label="Checkpoint on Hugging Face">
                    <span class="icon" aria-hidden="true">
                      🤗
                    </span>
                    <span>Checkpoint</span>
                  </a>
                </span>
                <!-- ModelScope Checkpoint link -->
                <span class="link-block">
                  <a class="external-link button is-normal is-rounded is-dark"
                     href="https://www.modelscope.cn/models/iic/ACE-0.6B-512px" target="_blank" rel="noopener"
                     aria-label="Checkpoint on ModelScope">
                    <span class="icon" aria-hidden="true">
                      <img src="static/images/modelscope_icon.png" alt="">
                    </span>
                    <span>Checkpoint</span>
                  </a>
                </span>
              </div>
            </div>
          </div>
        </div>
      </div>
      <!-- Release news. These are announcements rather than section headings, so they are marked up as a list;
           a heading element is also not permitted inside <b>. -->
      <div class="container is-max-desktop">
        <ul>
          <li><b>[<time datetime="2024-11-01">2024/11/01</time>] 🔥 We release our <a href="https://huggingface.co/spaces/scepter-studio/ACE-Chat" target="_blank" rel="noopener">ACE-Chat</a> on Huggingface Space.</b></li>
          <li><b>[<time datetime="2024-11-01">2024/11/01</time>] 🔥 The ACE checkpoint has been uploaded to both <a href="https://www.modelscope.cn/models/iic/ACE-0.6B-512px" target="_blank" rel="noopener">ModelScope</a> and <a href="https://huggingface.co/scepter-studio/ACE-0.6B-512px" target="_blank" rel="noopener">HuggingFace</a> platforms.</b></li>
          <li><b>[<time datetime="2024-11-05">2024/11/05</time>] 🔥 We release our <a href="https://github.com/ali-vilab/ACE/" target="_blank" rel="noopener">ACE Code</a> on GitHub.</b></li>
        </ul>
      </div>
    </div>
  </section>


<!-- Teaser video-->
<section class="hero teaser is-light">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <!-- NOTE(review): alt text is inferred from the file name; adjust it to the wording actually shown in the animation. -->
      <img src="static/images/slogan_v2.gif" alt="ACE slogan animation">
      <!-- The clip is a WebM file, so the source must declare video/webm: browsers use the declared type
           to decide whether to try a source at all. -->
      <video id="tree" autoplay controls muted loop height="100%">
        <source src="static/videos/teaser_final.webm" type="video/webm">
      </video>
<!--      <h2 class="subtitle has-text-centered">-->
<!--        Multi-turn image editing results of ACE. ACE supports a wide range of image editing-->
<!--        and generation tasks through natural language instructions, allowing complex editing requests to be-->
<!--        easily accomplished through multi-turn interactions.-->
<!--      </h2>-->
    </div>
  </div>
</section>
<!-- End teaser video -->

<!-- Paper abstract -->
<section class="section hero">
  <!-- Abstract block: a heading followed by a single paragraph holding the paper's abstract. -->
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <!-- The column is restricted with is-four-fifths; the heading inherits the centered alignment set on the row. -->
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <!-- has-text-justified overrides the centered alignment for the body text. -->
        <div class="content has-text-justified">
          <p>
            Diffusion models have emerged as a powerful generative technology and have been found to be applicable in various scenarios.
            Most existing foundational diffusion models are primarily designed for text-guided visual generation and do not support multi-modal conditions, which are essential for many visual editing tasks.
            This limitation prevents these foundational diffusion models from serving as a unified model in the field of visual generation, like GPT-4 in the natural language processing field.
            In this work, we propose ACE, an All-round Creator and Editor, which achieves comparable performance compared to those expert models in a wide range of visual generation tasks.
            To achieve this goal, we first introduce a unified condition format termed Long-context Condition Unit (LCU), and propose a novel Transformer-based diffusion model that uses LCU as input, aiming for joint training across various generation and editing tasks.
            Furthermore, we propose an efficient data collection approach to address the issue of the absence of available training data.
            It involves acquiring pairwise images with synthesis-based or clustering-based pipelines and supplying these pairs with accurate textual instructions by leveraging a fine-tuned multi-modal large language model.
            To comprehensively evaluate the performance of our model, we publish a benchmark of manually annotated image pairs across a variety of visual generation tasks.
            The extensive experimental results demonstrate the superiority of our model in visual generation fields.
            Thanks to the all-in-one capabilities of our model, we can easily build a chat system that responds to any image creation request using a single model to serve as the backend, avoiding the cumbersome pipeline typically employed in visual agents.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End paper abstract -->


<!-- Image carousel -->
<section class="hero is-small is-light">
  <div class="hero-body">
    <div class="container">
      <!-- Carousel of overview figures; every image needs real alt text rather than the template placeholder. -->
      <div id="method-carousel" class="carousel results-carousel">
        <div class="item">
          <img src="static/images/tasks.png" alt="Overview of the generation and editing task types supported by ACE">
          <h2 class="subtitle has-text-centered">
            The overview of all generation and editing types, categorized based on different visual
            modality input conditions, can be defined with the proposed input paradigm.
          </h2>
        </div>
        <div class="item">
          <!-- NOTE(review): alt text is inferred from the file name; refine it to describe the figure's content. -->
          <img src="static/images/method.png" alt="Overview of the ACE method">
        </div>
        <!-- Template for an additional slide:
        <div class="item">
          <img src="static/images/carousel3.jpg" alt="Description of the third image">
          <h2 class="subtitle has-text-centered">
            Third image description.
          </h2>
        </div>
        -->
      </div>
    </div>
  </div>
</section>
<!-- End image carousel -->


<!-- Video carousel -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <h2 class="title is-3" style="margin-left: 10%;">Application 1: ChatBot</h2>
<!--      <h3 class="title is-5">-->
<!--        we build a Chat Bot application to achieve chat-based image generation and editing. Rather than cumbersome visual agent pipeline, our chat bot support all image-->
<!--        creation request with only one model serves as backend, hence achieves significant efficiency improvement compared with visual agent.-->
<!--        The Chat Bot's link is: xxxx-->
<!--      </h3>-->
      <div class="chatbot-video">
        <!-- The demo is a WebM file, so the source declares video/webm; a mismatched video/mp4 type makes
             browsers pick or skip the source based on the wrong format. -->
        <video id="chatbot_video" autoplay controls muted loop height="100%">
          <source src="static/videos/demo_chat_crop_v3.webm" type="video/webm">
        </video>
      </div>
    </div>
  </div>
</section>
<!-- End video carousel -->


<!--&lt;!&ndash; Youtube video &ndash;&gt;-->
<!--<section class="hero is-small is-light">-->
<!--  <div class="hero-body">-->
<!--    <div class="container">-->
<!--      &lt;!&ndash; Paper video. &ndash;&gt;-->
<!--      <h2 class="title is-3">Video Presentation</h2>-->
<!--      <div class="columns is-centered has-text-centered">-->
<!--        <div class="column is-four-fifths">-->

<!--          <div class="publication-video">-->
<!--            &lt;!&ndash; Youtube embed code here &ndash;&gt;-->
<!--            <iframe src="https://www.youtube.com/embed/JkaxUblCGz0" frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>-->
<!--          </div>-->
<!--        </div>-->
<!--      </div>-->
<!--    </div>-->
<!--  </div>-->
<!--</section>-->
<!-- End youtube video -->


<!-- Video carousel -->
<section class="hero is-small is-light">
  <div class="hero-body">
    <div class="container">
      <h2 class="title is-3" style="margin-left: 10%;">Application 2: Key Frames for Long Movie Production</h2>
      <!-- Carousel of demo clips. Every clip is a WebM file, so each source declares video/webm
           (the previous video/mp4 type did not match the files). -->
      <div id="results-carousel" class="carousel results-carousel">
        <div class="item item-video1">
          <video id="video1" autoplay controls muted loop height="100%">
            <source src="static/videos/demo_application_3.webm" type="video/webm">
          </video>
        </div>
        <div class="item item-video2">
          <video id="video2" autoplay controls muted loop height="100%">
            <source src="static/videos/demo_application_2.webm" type="video/webm">
          </video>
        </div>
        <div class="item item-video3">
          <video id="video3" autoplay controls muted loop height="100%">
            <source src="static/videos/demo_application_1.webm" type="video/webm">
          </video>
        </div>
        <div class="item item-video4">
          <video id="video4" autoplay controls muted loop height="100%">
            <source src="static/videos/demo_application_4.webm" type="video/webm">
          </video>
        </div>
        <div class="item item-video5">
          <video id="video5" autoplay controls muted loop height="100%">
            <source src="static/videos/demo_application_5.webm" type="video/webm">
          </video>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End video carousel -->


<!-- Paper poster -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <h2 class="title" style="margin-left: 10%;">Visualization</h2>
      <div class="chatbot-video">
        <!-- An iframe needs a title so assistive technology can announce what the embedded document is. -->
        <iframe src="static/pdfs/ACE-Visualization.pdf" title="ACE visualization (PDF)" width="100%" height="550">
        </iframe>
      </div>
    </div>
  </div>
</section>
<!--End paper poster -->


<!--BibTex citation -->
  <section class="section" id="BibTeX">
    <div class="container">
      <div class="container" style="width: 80%">
        <h2 class="title">BibTeX</h2>
        <!-- The pre element preserves whitespace verbatim, so the entry starts directly after the opening
             code tag and stays flush-left; otherwise the displayed and copied citation would carry a
             blank first line and the markup's indentation. -->
        <pre><code>@article{wanx_ace,
    title = {ACE: All-round Creator and Editor Following Instructions via Diffusion Transformer},
    author = {Han, Zhen and Jiang, Zeyinzi and Pan, Yulin and Zhang, Jingfeng and Mao, Chaojie and Xie, Chenwei and Liu, Yu and Zhou, Jingren},
    journal = {arXiv preprint arXiv:2410.00086},
    year = {2024}
}</code></pre>
      </div>
    </div>
  </section>
<!--End BibTex citation -->


  <!-- Site footer: template attribution and license notice. -->
  <footer class="footer">
    <div class="container">
      <div class="columns is-centered">
        <div class="column is-8">
          <div class="content">

            <p>
              This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank" rel="noopener">Academic Project Page Template</a> which was adapted from the <a href="https://nerfies.github.io" target="_blank" rel="noopener">Nerfies</a> project page.
              You are free to borrow the source code of this website, we just ask that you link back to this page in the footer. <br> This website is licensed under a <a rel="license noopener" href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
              Commons Attribution-ShareAlike 4.0 International License</a>.
            </p>

          </div>
        </div>
      </div>
    </div>
  </footer>

<!-- Statcounter tracking code -->
  
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->

    <!-- End of Statcounter Code -->

  </body>
  </html>