Skip to content

Commit

Permalink
Merge pull request #8 from PaddyWan/phonetic_add
Browse files Browse the repository at this point in the history
Enable models that use a phonetic alphabet.
  • Loading branch information
maxilevi authored Aug 21, 2024
2 parents 27d448c + 1b3b588 commit 4e46d7e
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 15 deletions.
10 changes: 8 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,20 @@ target_include_directories(vits PRIVATE ggml/include)

target_link_libraries(vits PRIVATE ggml)

# Optional espeak-ng support: probe for the library first, then pull full
# include/link flags from pkg-config. VITS_ESPEAK gates the phoneme code paths.
find_library(LIBESPEAK_LIBRARIES espeak-ng)
if(LIBESPEAK_LIBRARIES)
# NOTE(review): pkg_check_modules is REQUIRED here, so if the library exists
# but its espeak-ng.pc file is missing, configuration fails hard instead of
# falling back to a non-phonetic build — confirm this is intended.
find_package(PkgConfig REQUIRED)
pkg_check_modules(ESPEAK REQUIRED espeak-ng)
add_definitions(-DVITS_ESPEAK)
endif()

if (NOT CMAKE_SYSTEM_NAME STREQUAL "iOS")
add_executable(main test/main.cpp)

add_executable(bench_e2e test/bench_e2e.cpp)

target_include_directories(main PRIVATE src/include ggml/include)
target_link_libraries(main PRIVATE vits ggml)
target_include_directories(main PRIVATE src/include ggml/include ${ESPEAK_INCLUDE_DIRS})
target_link_libraries(main PRIVATE vits ggml ${ESPEAK_LIBRARIES})

target_include_directories(bench_e2e PRIVATE src/include ggml/include)
target_link_libraries(bench_e2e PRIVATE vits ggml)
Expand Down
2 changes: 2 additions & 0 deletions src/include/vits_tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@ class vits_tokenizer {
// Maps a single token string to its integer id.
// NOTE(review): presumably falls back to unk_token when the token is not in
// the vocabulary — confirm against the definition in vits_tokenizer.cpp.
int convert_token_to_id(const std::string& token);

// Constructs a tokenizer from the serialized model stream.
static std::unique_ptr<vits_tokenizer> load(std::istream& file);
// Switches tokenize() to the phoneme path. Requires a VITS_ESPEAK build,
// which initializes espeak-ng on the first call; otherwise it is a hard error.
void set_phonetic();

private:
// Token string -> token id.
std::unordered_map<std::string, int32_t> vocab;
// When true, tokenize() intersperses the pad/blank token between ids.
bool add_blank;
bool normalize;
// When true, text is converted to espeak phonemes before vocab lookup.
bool phonetic = false;
std::string pad_token;
std::string unk_token;
};
Expand Down
4 changes: 4 additions & 0 deletions src/vits_model_data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ std::tuple<std::unordered_map<std::string, ggml_tensor*>, std::unordered_map<std
}
printf("Loaded %lu tensors\n", tensors.size());

auto it = config.find("phonetic");
if(it != config.end() && it->second == "1")
tokenizer->set_phonetic();

return std::make_tuple(tensors, config, std::move(tokenizer));
}

Expand Down
129 changes: 116 additions & 13 deletions src/vits_tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@
#include "include/common.h"
#include "include/debug.h"


#ifdef VITS_ESPEAK
#include <cstring>
#include <espeak-ng/speak_lib.h>
#endif

// Default constructor: members keep their in-class initializers; the
// vocabulary and flags are presumably populated by load() — see the header.
vits_tokenizer::vits_tokenizer() {

}
Expand Down Expand Up @@ -112,7 +118,72 @@ std::string vits_tokenizer::prepare_for_tokenization(const std::string& text, bo
return {processed_text, {}};
}

#ifdef VITS_ESPEAK
// Process-wide guard so espeak-ng is initialized at most once (set in
// set_phonetic()). Plain bool: not thread-safe.
static bool init_espeak = false;
// Converts plain text into espeak phoneme characters, keeping clause
// punctuation ("!\\,.:;?") in the output so the model can render pauses.
// [b, e) delimits a mutable buffer (std::string::data() in the caller); the
// buffer is modified in place (separators are temporarily NUL-overwritten).
std::vector<char> convert_to_phonetic(char* b, char* e)
{
    std::vector<char> ret;
    ret.reserve(2 * static_cast<std::size_t>(std::distance(b, e)));

    // Appends the espeak phonemes for the NUL-terminated clause at `text`.
    // Fix: espeak_TextToPhonemes returns NULL on failure — the previous code
    // passed that straight to strlen(), which is undefined behavior.
    // NOTE(review): espeak translates one clause per call and advances its
    // pointer; a chunk containing an internal clause break (e.g. a newline)
    // has its remainder dropped, same as the original code — confirm inputs
    // are single-clause chunks.
    auto append_phonemes = [&ret](const char* text) {
        const char* phonemes =
            espeak_TextToPhonemes((const void**)&text, espeakCHARS_8BIT, 2);
        if (phonemes != nullptr)
            ret.insert(ret.end(), phonemes, phonemes + std::strlen(phonemes));
    };

    while (b != e)
    {
        // Portable (ptr, len) construction; the iterator-pair form used
        // before is C++20-only.
        const std::string_view rest(b, static_cast<std::size_t>(e - b));
        const std::size_t pos = rest.find_first_of("!\\,.:;?");
        if (pos == std::string_view::npos)
        {
            // No separator left: phonemize the whole remainder.
            append_phonemes(b);
            b = e;
            continue;
        }

        const char sep = b[pos];
        b[pos] = 0; // temporarily terminate so espeak sees a single clause
        append_phonemes(b);
        ret.push_back(sep);
        b += pos + 1;

        // Keep a "..." ellipsis intact instead of splitting it into clauses.
        // NOTE(review): reading b[0]/b[1] near the end of the buffer relies on
        // the NUL terminator std::string guarantees past `e` — confirm no
        // caller ever passes a non-string buffer.
        if (sep == '.' && b[0] == '.' && b[1] == '.')
        {
            ret.push_back('.');
            ret.push_back('.');
            b += 2;
        }
        if (b != e)
            ret.push_back(' '); // clause separator before the next chunk
    }
    return ret;
}

void vits_tokenizer::set_phonetic()
{
if(!init_espeak)
{
espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, nullptr, 0);
espeak_VOICE voice{nullptr, nullptr, nullptr, 0, 0};
voice.languages="en-us";
espeak_ERROR result = espeak_SetVoiceByProperties(&voice);
ASSERT(result == EE_OK, "Espeak did not correctly initialize");
init_espeak = true;
//we have no espeak_Terminate();
}
phonetic = true;
}
#else
// Stub for builds without espeak-ng: phonetic models cannot be used, so
// requesting the phoneme path is a hard error rather than a silent no-op.
void vits_tokenizer::set_phonetic()
{
    ASSERT(false, "Espeak is not available");
}
#endif


// Converts `text` into model token ids.
//
// Two paths:
//  * default: lowercase the text, run tokenize_fast(), and (when add_blank)
//    intersperse the pad token between ids: [pad, t0, pad, t1, ..., pad];
//  * phonetic (VITS_ESPEAK builds, after set_phonetic()): phonemize via
//    espeak, then greedily match phoneme prefixes against the vocabulary.
//
// Fixes vs. the previous version:
//  * the non-phonetic path returned an EMPTY vector when add_blank was false
//    (tokens_final was only assigned inside the add_blank branch);
//  * a trailing `return tokens_final;` after the phonetic branch referenced a
//    variable that was out of scope when VITS_ESPEAK was defined;
//  * vocab matching now picks the LONGEST matching entry instead of the first
//    one found in unordered_map iteration order, which is arbitrary and made
//    the tokenization non-deterministic across standard-library versions.
std::vector<int32_t> vits_tokenizer::tokenize(const std::string& text) {
#ifdef VITS_ESPEAK
    if (phonetic) {
        std::string copy(text); // convert_to_phonetic mutates its buffer
        std::vector<char> phonemes = convert_to_phonetic(copy.data(), copy.data() + copy.size());

        std::vector<int32_t> tokens;
        tokens.reserve(phonemes.size() * (add_blank ? 2 : 1) + 1);
        // NOTE(review): this path uses id 0 for both the blank and the
        // unknown token, while the plain path uses vocab[pad_token] — assumed
        // to be id 0 in phonetic models; confirm against the exported vocab.
        if (add_blank)
            tokens.push_back(0);

        std::string_view rest(phonemes.data(), phonemes.size());
        while (!rest.empty()) {
            // Greedy longest-prefix match over the vocabulary (linear scan).
            auto best = vocab.end();
            std::size_t best_len = 0;
            for (auto it = vocab.begin(); it != vocab.end(); ++it) {
                const std::size_t len = it->first.size();
                if (len > best_len && rest.substr(0, len) == it->first) {
                    best = it;
                    best_len = len;
                }
            }
            if (best == vocab.end()) {
                tokens.push_back(0); // no vocab entry matches: emit unknown
                rest.remove_prefix(1);
            } else {
                tokens.push_back(best->second);
                rest.remove_prefix(best_len);
            }
            if (add_blank)
                tokens.push_back(0);
        }
        return tokens;
    }
#endif

    // Plain path: lowercase, then vocabulary tokenization.
    std::string processed_text = text;
    std::transform(processed_text.begin(), processed_text.end(), processed_text.begin(),
                   [](unsigned char c) { return std::tolower(c); });

    std::vector<int32_t> tokens = tokenize_fast(processed_text);
    if (!add_blank)
        return tokens; // previously this returned an empty vector

    // Intersperse the pad token: [pad, t0, pad, t1, ..., pad].
    std::vector<int32_t> interspersed(tokens.size() * 2 + 1, vocab[pad_token]);
    for (std::size_t i = 0; i < tokens.size(); ++i)
        interspersed[i * 2 + 1] = tokens[i];
    return interspersed;
}

int vits_tokenizer::convert_token_to_id(const std::string& token) {
Expand Down

0 comments on commit 4e46d7e

Please sign in to comment.