-
Notifications
You must be signed in to change notification settings - Fork 1
/
index.py
207 lines (189 loc) · 7.82 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import inspect
import os
from pathlib import Path
import dominate
from dominate.tags import *
from dominate.util import raw
from templates import header, authors_row
# Load components
import abstract
import prompt
import inpaint
import demo_compare
import demo_compare_meta
import celeb
import edit
# Directory holding this script; the generated page is written into it.
root_path = Path(inspect.getfile(inspect.currentframe())).parent

# title=None here: the <title> element is added explicitly to <head> below,
# after the charset/viewport metadata.
doc = dominate.document(title=None)

# Fill <head> in one call. Tags are appended in argument order: metadata,
# page title, then the Bootstrap and site-specific stylesheets.
doc.head.add(
    meta(charset="utf-8"),
    meta(http_equiv="X-UA-Compatible", content="IE=edge"),
    meta(name="viewport", content="width=device-width, initial-scale=1"),
    title("ARDiT TTS Demo"),
    link(
        href="/ardit-web/statics/bootstrap-5.2.3-dist/css/bootstrap.min.css",
        rel="stylesheet",
    ),
    link(href="/ardit-web/statics/my.css", rel="stylesheet"),
)
# Bootstrap utility classes shared by every white, shadowed section "card".
_CARD_CLS = "container pt-5 mt-5 shadow p-5 mb-5 bg-white rounded"


def _card():
    """Return a new section ``div`` carrying the shared card classes.

    Intended to be used as ``with _card():`` so that tags created in the
    body are attached to the returned div by dominate's context handling.
    """
    return div(cls=_CARD_CLS)


def _scroll_hint():
    """Create the footnote telling readers that the table scrolls sideways.

    When called inside a dominate ``with`` block the new paragraph is
    attached to the enclosing element; the tag is also returned.
    """
    return p(
        "* please scroll horizontally to explore additional columns in the table.",
        cls="lead",
    )


# Page body: one card per demo section, in display order.
# NOTE(review): the original indentation was lost, so the nesting of the
# sections below was reconstructed. They are assumed to all live inside the
# container/row wrapper -- confirm against the rendered page.
with doc:
    # Title and Metadata:
    # `.add()` returns the inner row div, so the cards below become its
    # children while the outer container is attached to the document.
    with div(cls="container").add(div(cls="row")):
        with _card():
            header(
                title="Autoregressive Diffusion Transformer for Text-to-Speech Synthesis",
                sub="",
            )
            br()
            abstract.section_abstract()
            p(
                "You can download all audio files on this page by cloning this ",
                a(
                    "github repository",
                    href="https://github.com/zjlww/ardit-web",
                ),
                ".",
                cls="lead",
            )

        # Each `<module>.get_table()` call below presumably builds its table
        # inside the current card -- verify against the component modules.
        with _card():
            h3("Prompted Generation")
            p(
                """
In this task, we evaluate on test set B. We pick a prompt and a target utterance from the same speaker. The models generate target waveforms with prompt waveforms and the transcript of both sentences.
All speakers are unseen for all systems during training.
""",
                cls="lead",
            )
            prompt.get_table()
            _scroll_hint()

        with _card():
            h3("Speech Inpainting")
            p(
                """
We evaluated the performance of text-based speech editing on the speech inpainting task.
The models generate complete waveforms given complete texts and partially masked waveforms. The masked sections are highlighted within the text.
All speakers were unseen by all systems during training. The following 20 test cases are from test set C (long).
""",
                cls="lead",
            )
            inpaint.get_table()
            _scroll_hint()

        with _card():
            h3("Prompted Generation (Comparing with Proprietary Systems I)")
            p(
                """
In this section, we compare our system with proprietary systems including NaturalSpeech 2/3, MegaTTS 2, UniAudio, CLaM-TTS, VoiceBox, and VALL-E. The source codes and model weights for these models are not available.
The following samples are obtained from their online demo pages. All waveforms are downsampled to 16kHz.
Please note that ARDiT's performance is influenced by the fact that the prompt waveforms are in 16kHz, not 24kHz, and the prompt texts are not semantically coherent with the target texts.
""",
                cls="lead",
            )
            p(
                "1~4 are obtained from ",
                a(
                    "NaturalSpeech 3",
                    href="https://speechresearch.github.io/naturalspeech3/",
                ),
                " and 5~20 are obtained from ",
                a("CLaM-TTS", href="https://clam-tts.github.io/"),
                "'s demo page.",
                cls="lead",
            )
            demo_compare.get_table()
            _scroll_hint()

        with _card():
            h3("Prompted Generation (Comparing with Proprietary Systems II)")
            p(
                """
In this section, we compare our system with proprietary Flow Matching based TTS systems including VoiceBox and SpeechFlow. The source codes and model weights for these models are not available.
The following samples are obtained from their online demo pages. All waveforms are downsampled to 16kHz.
Please note that ARDiT's performance is influenced by the fact that the prompt waveforms are in 16kHz, not 24kHz, and the prompt texts are not semantically coherent with the target texts.
""",
                cls="lead",
            )
            p(
                "Audio samples are obtained from ",
                a("voicebox.metademolab.com", href="https://voicebox.metademolab.com/"),
                cls="lead",
            )
            demo_compare_meta.get_table()
            _scroll_hint()

        with _card():
            h3("Prompted Generation (Celebrities and Game Characters)")
            p(
                """
ARDiT trained only on LibriTTS is capable of imitating famous figures' voice.
""",
                cls="lead",
            )
            p(
                "Prompts and baseline results are obtained from ",
                a("Mega-TTS", href="https://mega-tts.github.io/demo-page/"),
                " and ",
                a("CLaM-TTS", href="https://clam-tts.github.io/"),
                "'s demo pages.",
                cls="lead",
            )
            celeb.get_table()
            _scroll_hint()

        with _card():
            h3("Speech Editing")
            # NOTE(review): the intro names VoiceBox while the credit line
            # below points at VoiceCraft's demo page -- confirm which one is
            # intended before changing either string.
            p(
                """In this section, we compare the speech editing performance of ARDiT with VoiceBox's demo.""",
                cls="lead",
            )
            p(
                """The following audio samples are obtained from """,
                a(
                    "VoiceCraft's demo page",
                    href="https://jasonppy.github.io/VoiceCraft_web/",
                ),
                ".",
                cls="lead",
            )
            edit.get_table()

        with _card():
            # Imported at its original position so that any import-time work
            # in rate_control still happens inside this card's context.
            # NOTE(review): hoist to the top-of-file imports if the module
            # has no import-time side effects.
            import rate_control

            h3("Speech Rate Control")
            p(
                """ARDiT TTS can control the output speech rate to some extent, by controlling the total audio duration.""",
                cls="lead",
            )
            rate_control.get_table()
# External scripts are attached to the document footer: jQuery first, then
# Bootstrap's JavaScript bundle.
# NOTE(review): the stylesheet is served from ".../bootstrap-5.2.3-dist/css/",
# whereas this script path has no "js/" segment -- confirm the file exists.
doc.footer.add(
    script(src="/ardit-web/statics/jquery/jquery-3.7.1.slim.min.js"),
    script(src="/ardit-web/statics/bootstrap-5.2.3-dist/bootstrap.min.js"),
)

# Script for allowing only one audio to play at the same time:
# when any <audio> element starts playing, every other <audio> element is
# paused and rewound to the beginning.
_SINGLE_AUDIO_JS = """
$(function(){
$("audio").on("play", function() {
$("audio").not(this).each(function(index, audio) {
audio.pause();
audio.currentTime = 0;
});
});
});
"""

# Appended straight onto the document's child list rather than through
# `add()`; presumably this renders the inline script after </body>.
doc.children.append(script(raw(_SINGLE_AUDIO_JS)))
# Render the document and save it as index.html next to this script.
# The page declares <meta charset="utf-8">, so the file is written as UTF-8
# explicitly; without `encoding=` the platform default is used, which can
# mis-encode or reject non-ASCII characters on some systems.
with open(root_path / "index.html", "w", encoding="utf-8") as index:
    index.write(doc.render())