From 2f069335ab289bb8de74f2460ccb52ebaf02a5af Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 8 Oct 2022 10:56:59 +0300 Subject: [PATCH] Adding sanitizer tests --- .github/workflows/build.yml | 6 ++-- .gitignore | 1 + tests/CMakeLists.txt | 62 +++++++++++++++++++++++++++++++++++++ whisper.cpp | 11 +++++-- whisper.h | 11 ++++--- 5 files changed, 81 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index effa8db3226..f1b63272d08 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -61,7 +61,7 @@ jobs: - name: Build run: | make - ctest --output-on-failure + ctest -L gh --output-on-failure ubuntu-latest-clang: runs-on: ubuntu-latest @@ -87,7 +87,7 @@ jobs: - name: Build run: | make - ctest --output-on-failure + ctest -L gh --output-on-failure ubuntu-latest-gcc-sanitized: runs-on: ubuntu-latest @@ -112,4 +112,4 @@ jobs: - name: Build run: | make - ctest --output-on-failure + ctest -L gh --output-on-failure diff --git a/.gitignore b/.gitignore index 23b28c178d8..860e0d915e8 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ stream *.o .cache build/ +compile_commands.json diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e69de29bb2d..b51fbbb6b56 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -0,0 +1,62 @@ +set(TEST_TARGET test-main-tiny) +add_test(NAME ${TEST_TARGET} + COMMAND $<TARGET_FILE:main> + -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-tiny.bin + -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) +set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "tiny;gh") + +set(TEST_TARGET test-main-tiny.en) +add_test(NAME ${TEST_TARGET} + COMMAND $<TARGET_FILE:main> + -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-tiny.en.bin + -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) +set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "tiny;en;gh") + +set(TEST_TARGET test-main-base) +add_test(NAME ${TEST_TARGET} + COMMAND $<TARGET_FILE:main> + -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-base.bin + -f 
${PROJECT_SOURCE_DIR}/samples/jfk.wav) +set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "base") + +set(TEST_TARGET test-main-base.en) +add_test(NAME ${TEST_TARGET} + COMMAND $<TARGET_FILE:main> + -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-base.en.bin + -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) +set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "base;en") + +set(TEST_TARGET test-main-small) +add_test(NAME ${TEST_TARGET} + COMMAND $<TARGET_FILE:main> + -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-small.bin + -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) +set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "small") + +set(TEST_TARGET test-main-small.en) +add_test(NAME ${TEST_TARGET} + COMMAND $<TARGET_FILE:main> + -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-small.en.bin + -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) +set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "small;en") + +set(TEST_TARGET test-main-medium) +add_test(NAME ${TEST_TARGET} + COMMAND $<TARGET_FILE:main> + -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-medium.bin + -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) +set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "medium") + +set(TEST_TARGET test-main-medium.en) +add_test(NAME ${TEST_TARGET} + COMMAND $<TARGET_FILE:main> + -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-medium.en.bin + -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) +set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "medium;en") + +set(TEST_TARGET test-main-large) +add_test(NAME ${TEST_TARGET} + COMMAND $<TARGET_FILE:main> + -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-large.bin + -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) +set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "large") diff --git a/whisper.cpp b/whisper.cpp index cb15b986844..cdf76beb19f 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -950,6 +950,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) { // load weights { + int n_loaded = 0; size_t total_size = 0; while (true) { @@ -1004,9 +1005,17 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) { 
//printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); total_size += ggml_nbytes(tensor); + n_loaded++; } printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + + if (n_loaded == 0) { + printf("%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__); + } else if (n_loaded != model.tensors.size()) { + fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), n_loaded); + return false; + } } fin.close(); @@ -1772,8 +1781,6 @@ bool whisper_decode( } // the most basic sampling scheme - select the top token -// TODO: beam search -// TODO: temperature whisper_vocab::id whisper_sample_best( const whisper_vocab & vocab, const float * probs, bool need_timestamp) { diff --git a/whisper.h b/whisper.h index 79df0e04a26..3f8ddc978e1 100644 --- a/whisper.h +++ b/whisper.h @@ -71,11 +71,12 @@ extern "C" { // return the id of the specified language, returns -1 if not found WHISPER_API int whisper_lang_id(const char * lang); - WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length - WHISPER_API int whisper_n_vocab (struct whisper_context * ctx); - WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx); - WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx); - WHISPER_API float * whisper_get_probs (struct whisper_context * ctx); + WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length + WHISPER_API int whisper_n_vocab (struct whisper_context * ctx); + WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx); + WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx); + + WHISPER_API float * whisper_get_probs(struct whisper_context * ctx); WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);