-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.html
402 lines (367 loc) · 18 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
<!DOCTYPE html>
<!-- Project page for the ACE paper. The page text is English, so the language is declared for assistive technology and translation tools. -->
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- Required for the Bulma responsive layout: without it mobile browsers render the page at desktop width and scale it down. -->
  <meta name="viewport" content="width=device-width, initial-scale=1">

  <!-- Meta tags for search engines and social media banners; these act as the page's "business card". -->
  <meta name="description" content="ACE: All-round Creator and Editor Following Instructions via Diffusion Transformer">
  <meta property="og:title" content="ACE">
  <meta property="og:description" content="All-round Creator and Editor Following Instructions via Diffusion Transformer">
  <!-- NOTE(review): og:url is still empty; fill in the public URL of this page once it is known. -->
  <meta property="og:url" content="">

  <!-- Template placeholders kept for later use. Banner image: optimal dimensions are 1200x630 (Open Graph) and 1200x600 (Twitter). -->
  <!-- <meta property="og:image" content="static/image/your_banner_image.png" />-->
  <!-- <meta property="og:image:width" content="1200"/>-->
  <!-- <meta property="og:image:height" content="630"/>-->
  <!-- <meta name="twitter:title" content="TWITTER BANNER TITLE META TAG">-->
  <!-- <meta name="twitter:description" content="TWITTER BANNER DESCRIPTION META TAG">-->
  <!-- <meta name="twitter:image" content="static/images/your_twitter_banner_image.png">-->
  <!-- <meta name="twitter:card" content="summary_large_image">-->
  <!-- Keywords for the paper to be indexed by: -->
  <!-- <meta name="keywords" content="KEYWORDS SHOULD BE PLACED HERE">-->

  <title>ACE: All-round Creator and Editor Following Instructions via Diffusion Transformer</title>
  <link rel="icon" type="image/x-icon" href="static/images/logo.png">

  <!-- NOTE(review): this stylesheet link has an empty href, so it loads nothing. Presumably a web-font URL belongs here; confirm and either restore the URL or remove the tag. -->
  <link href=""
        rel="stylesheet">

  <!-- Stylesheets: Bulma core and plugins, icon fonts, then the page-specific overrides last. -->
  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="static/css/index.css">

  <!-- Scripts: load order is preserved from the original page (index.js is listed after the libraries it may rely on). -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>
  <script src="static/js/index.js"></script>
</head>
<body>
<!-- TODO: release-date announcement -->
<!-- Header: paper title, authors, affiliation, resource buttons and release news -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <!-- The icon image stands in for the word "ACE" in the title, so its alt text carries that word. -->
          <h1 class="title is-2 publication-title">
            <img src="static/images/icon.png" class="title_icon" alt="ACE">: All-round Creator and Editor Following Instructions via Diffusion Transformer</h1>
          <div class="is-size-5 publication-authors">
            <!-- Paper authors -->
            <!-- NOTE(review): every author link below has an empty href; fill in the authors' home pages or drop the anchors. -->
            <span class="author-block">
              <a href="" target="_blank">Zhen Han</a><sup>*</sup>,</span>
            <span class="author-block">
              <a href="" target="_blank">Zeyinzi Jiang</a><sup>*</sup>,</span>
            <span class="author-block">
              <a href="" target="_blank">Yulin Pan</a><sup>*</sup>,</span>
            <span class="author-block">
              <a href="" target="_blank">Jingfeng Zhang</a><sup>*</sup>,</span>
            <span class="author-block">
              <a href="" target="_blank">Chaojie Mao</a><sup>*†</sup>,</span>
          </div>
          <div class="is-size-5 publication-authors">
            <span class="author-block">
              <a href="" target="_blank">Chenwei Xie</a>,</span>
            <span class="author-block">
              <a href="" target="_blank">Yu Liu</a>,</span>
            <span class="author-block">
              <a href="" target="_blank">Jingren Zhou</a></span>
          </div>
          <!-- Affiliation and footnotes for the author markers -->
          <div class="is-size-5 publication-authors">
            <span class="author-block">Tongyi Lab, Alibaba Group</span>
            <span class="eql-cntrb"><small><br>
              <sup>*</sup> Equal Contribution. Order is determined by random dice rolling.
              <sup>†</sup> Project leader.</small></span>
          </div>
          <div class="is-size-5 publication-authors">
            <span class="eql-cntrb"><small>
              <b>Acknowledgments:</b> Haiming Zhao, Yuntao
              Hong, You Wu, Jixuan Chen, Yuwei Wang, and Sheng Yao for their data contributions, and Lianghua
              Huang, Kai Zhu, and Yutong Feng for their discussions, suggestions, and the sharing of resources.
            </small></span>
          </div>
          <div class="column has-text-centered">
            <!-- First row of resource buttons. External links opened in a new tab carry rel="noopener" explicitly. -->
            <div class="publication-links">
              <!-- Github link -->
              <span class="link-block">
                <a href="https://github.com/ali-vilab/ACE/" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fab fa-github"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>
              <!-- Scepter Github link -->
              <span class="link-block">
                <a href="https://github.com/modelscope/scepter" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    🪄
                  </span>
                  <span>Scepter</span>
                </a>
              </span>
              <!-- Huggingface Space link -->
              <span class="link-block">
                <a href="https://huggingface.co/spaces/scepter-studio/ACE-Chat" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    🤗
                  </span>
                  <span>Demo</span>
                </a>
              </span>
              <!-- ArXiv abstract Link -->
              <span class="link-block">
                <a href="https://arxiv.org/abs/2410.00086" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="ai ai-arxiv"></i>
                  </span>
                  <span>arXiv</span>
                </a>
              </span>
            </div>
            <!-- Second row: the two checkpoint mirrors. Both buttons read "Checkpoint", so the icon is what tells them apart. -->
            <div class="publication-links">
              <!-- Huggingface Checkpoint link -->
              <span class="link-block">
                <a href="https://huggingface.co/scepter-studio/ACE-0.6B-512px" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    🤗
                  </span>
                  <span>Checkpoint</span>
                </a>
              </span>
              <!-- ModelScope Checkpoint link -->
              <span class="link-block">
                <a href="https://www.modelscope.cn/models/iic/ACE-0.6B-512px" target="_blank" rel="noopener"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <img src="static/images/modelscope_icon.png" alt="ModelScope">
                  </span>
                  <span>Checkpoint</span>
                </a>
              </span>
            </div>
          </div>
        </div>
      </div>
    </div>
    <!-- Release news. The bold element sits inside the heading: a heading is not allowed inside <b>. -->
    <div class="container is-max-desktop">
      <h4><b>[2024/11/01] 🔥 We release our <a href="https://huggingface.co/spaces/scepter-studio/ACE-Chat" target="_blank" rel="noopener">ACE-Chat</a> on Huggingface Space.</b></h4>
      <h4><b>[2024/11/01] 🔥 The ACE checkpoint has been uploaded to both <a href="https://www.modelscope.cn/models/iic/ACE-0.6B-512px" target="_blank" rel="noopener">ModelScope</a> and <a href="https://huggingface.co/scepter-studio/ACE-0.6B-512px" target="_blank" rel="noopener">HuggingFace</a> platforms.</b></h4>
      <h4><b>[2024/11/05] 🔥 We release our <a href="https://github.com/ali-vilab/ACE/" target="_blank" rel="noopener">ACE Code</a> on GitHub.</b></h4>
    </div>
  </div>
</section>
<!-- Teaser: slogan animation followed by the muted, looping teaser video -->
<section class="hero teaser is-light">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <img src="static/images/slogan_v2.gif" alt="ACE slogan">
      <video poster="" id="tree" autoplay controls muted loop height="100%">
        <!-- The file is WebM, so the declared type must be video/webm; a mismatched type lets browsers skip the source without fetching it. -->
        <source src="static/videos/teaser_final.webm"
                type="video/webm">
      </video>
      <!-- <h2 class="subtitle has-text-centered">-->
      <!-- Multi-turn image editing results of ACE. ACE supports a wide range of image editing-->
      <!-- and generation tasks through natural language instructions, allowing complex editing requests to be-->
      <!-- easily accomplished through multi-turn interactions.-->
      <!-- </h2>-->
    </div>
  </div>
</section>
<!-- End teaser video -->
<!-- Abstract: centered heading above a single justified paragraph -->
<section class="section hero">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <!-- One sentence per source line to keep future diffs readable -->
        <div class="content has-text-justified">
          <p>
            Diffusion models have emerged as a powerful generative technology and have been found to be applicable in various scenarios.
            Most existing foundational diffusion models are primarily designed for text-guided visual generation and do not support multi-modal conditions, which are essential for many visual editing tasks.
            This limitation prevents these foundational diffusion models from serving as a unified model in the field of visual generation, like GPT-4 in the natural language processing field.
            In this work, we propose ACE, an All-round Creator and Editor, which achieves comparable performance compared to those expert models in a wide range of visual generation tasks.
            To achieve this goal, we first introduce a unified condition format termed Long-context Condition Unit (LCU), and propose a novel Transformer-based diffusion model that uses LCU as input, aiming for joint training across various generation and editing tasks.
            Furthermore, we propose an efficient data collection approach to address the issue of the absence of available training data.
            It involves acquiring pairwise images with synthesis-based or clustering-based pipelines and supplying these pairs with accurate textual instructions by leveraging a fine-tuned multi-modal large language model.
            To comprehensively evaluate the performance of our model, we publish a benchmark of manually annotated image pairs across a variety of visual generation tasks.
            The extensive experimental results demonstrate the superiority of our model in visual generation fields.
            Thanks to the all-in-one capabilities of our model, we can easily build a chat system that responds to any image creation request using a single model to serve as the backend, avoiding the cumbersome pipeline typically employed in visual agents.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End of abstract -->
<!-- Image carousel: task overview and method figure -->
<section class="hero is-small is-light">
  <div class="hero-body">
    <div class="container">
      <div id="method-carousel" class="carousel results-carousel">
        <div class="item">
          <!-- Task overview figure; the alt text summarises the caption below it -->
          <img src="static/images/tasks.png" alt="Overview of the generation and editing task types supported by ACE">
          <h2 class="subtitle has-text-centered">
            The overview of all generation and editing types, categorized based on different visual
            modality input conditions, can be defined with the proposed input paradigm.
          </h2>
        </div>
        <div class="item">
          <!-- Method figure. NOTE(review): alt text is inferred from the file name; refine it to describe the actual diagram. -->
          <img src="static/images/method.png" alt="Diagram of the ACE method">
        </div>
        <!-- <div class="item">-->
        <!-- Your image here -->
        <!-- <img src="static/images/carousel3.jpg" alt="MY ALT TEXT"/>-->
        <!-- <h2 class="subtitle has-text-centered">-->
        <!-- Third image description.-->
        <!-- </h2>-->
        <!-- </div>-->
      </div>
    </div>
  </div>
</section>
<!-- End image carousel -->
<!-- Application 1: single demo video of the ChatBot (not a carousel) -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <h2 class="title is-3" style="margin-left: 10%;">Application 1: ChatBot</h2>
      <!-- <h3 class="title is-5">-->
      <!-- we build a Chat Bot application to achieve chat-based image generation and editing. Rather than cumbersome visual agent pipeline, our chat bot support all image-->
      <!-- creation request with only one model serves as backend, hence achieves significant efficiency improvement compared with visual agent.-->
      <!-- The Chat Bot's link is: xxxx-->
      <!-- </h3>-->
      <div class="chatbot-video">
        <video poster="" id="chatbot_video" autoplay controls muted loop height="100%">
          <!-- The file is WebM, so the declared type must be video/webm; a mismatched type lets browsers skip the source without fetching it. -->
          <source src="static/videos/demo_chat_crop_v3.webm"
                  type="video/webm">
        </video>
      </div>
    </div>
  </div>
</section>
<!-- End of Application 1 -->
<!--<!– Youtube video –>-->
<!--<section class="hero is-small is-light">-->
<!-- <div class="hero-body">-->
<!-- <div class="container">-->
<!-- <!– Paper video. –>-->
<!-- <h2 class="title is-3">Video Presentation</h2>-->
<!-- <div class="columns is-centered has-text-centered">-->
<!-- <div class="column is-four-fifths">-->
<!-- <div class="publication-video">-->
<!-- <!– Youtube embed code here –>-->
<!-- <iframe src="https://www.youtube.com/embed/JkaxUblCGz0" frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>-->
<!-- </div>-->
<!-- </div>-->
<!-- </div>-->
<!-- </div>-->
<!-- </div>-->
<!--</section>-->
<!-- End youtube video -->
<!-- Application 2: carousel of five demo videos -->
<section class="hero is-small is-light">
  <div class="hero-body">
    <div class="container">
      <h2 class="title is-3" style="margin-left: 10%;">Application 2: Key Frames for Long Movie Production</h2>
      <!-- Every source below is a WebM file, so each declares type="video/webm"; a mismatched type lets browsers skip the source without fetching it. -->
      <!-- The stray "\" characters that followed three of the <video> tags were literal fallback text and have been removed. -->
      <div id="results-carousel" class="carousel results-carousel">
        <div class="item item-video1">
          <video poster="" id="video1" autoplay controls muted loop height="100%">
            <source src="static/videos/demo_application_3.webm"
                    type="video/webm">
          </video>
        </div>
        <div class="item item-video2">
          <video poster="" id="video2" autoplay controls muted loop height="100%">
            <source src="static/videos/demo_application_2.webm"
                    type="video/webm">
          </video>
        </div>
        <div class="item item-video3">
          <video poster="" id="video3" autoplay controls muted loop height="100%">
            <source src="static/videos/demo_application_1.webm"
                    type="video/webm">
          </video>
        </div>
        <div class="item item-video4">
          <video poster="" id="video4" autoplay controls muted loop height="100%">
            <source src="static/videos/demo_application_4.webm"
                    type="video/webm">
          </video>
        </div>
        <div class="item item-video5">
          <video poster="" id="video5" autoplay controls muted loop height="100%">
            <source src="static/videos/demo_application_5.webm"
                    type="video/webm">
          </video>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End video carousel -->
<!-- Visualization: embedded PDF of result visualizations -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <h2 class="title" style="margin-left: 10%;">Visualization</h2>
      <div class="chatbot-video">
        <!-- The title gives the frame an accessible name for screen-reader users -->
        <iframe src="static/pdfs/ACE-Visualization.pdf" title="ACE visualization results (PDF)" width="100%" height="550">
        </iframe>
      </div>
    </div>
  </div>
</section>
<!-- End visualization -->
<!-- Citation block: BibTeX entry for the ACE paper -->
<section class="section" id="BibTeX">
  <div class="container">
    <div class="container" style="width: 80%">
      <h2 class="title">BibTeX</h2>
      <!-- Whitespace inside pre is rendered literally, so the entry stays flush left and unchanged -->
      <pre><code>
@article{wanx_ace,
title = {ACE: All-round Creator and Editor Following Instructions via Diffusion Transformer},
author = {Han, Zhen and Jiang, Zeyinzi and Pan, Yulin and Zhang, Jingfeng and Mao, Chaojie and Xie, Chenwei and Liu, Yu and Zhou, Jingren},
journal = {arXiv preprint arXiv:2410.00086},
year = {2024}
}
</code></pre>
    </div>
  </div>
</section>
<!-- End of citation block -->
<!-- Footer: template attribution and content license -->
<footer class="footer">
  <div class="container">
    <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">
          <p>
            This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank" rel="noopener">Academic Project Page Template</a> which was adopted from the <a href="https://nerfies.github.io" target="_blank" rel="noopener">Nerfies</a> project page.
            You are free to borrow the source code of this website, we just ask that you link back to this page in the footer. <br> This website is licensed under a <a rel="license noopener" href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
            Commons Attribution-ShareAlike 4.0 International License</a>.
          </p>
        </div>
      </div>
    </div>
  </div>
</footer>
<!-- Statcounter tracking code -->
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->
<!-- End of Statcounter Code -->
</body>
</html>