Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix hangups and VAD segmentation #157

Merged
merged 18 commits into from
Sep 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
f4d2cfc
Fix hangups and VAD segmentation
royshil Aug 22, 2024
dcf368d
feat: Add max_sub_duration field to transcription filter data
royshil Aug 23, 2024
a93cb6a
chore: Update VAD parameters for better segmentation accuracy
royshil Aug 23, 2024
16bb8be
feat: Add segment_duration field to transcription filter data
royshil Aug 23, 2024
e572c35
feat: Optimize VAD processing for better performance
royshil Aug 24, 2024
a370081
feat: Refactor token buffer thread and whisper processing
royshil Aug 24, 2024
1f4bf65
Refactor token buffer thread and whisper processing
royshil Aug 27, 2024
3ad5df1
refactor: Update translation context in transcription filter
royshil Aug 28, 2024
0e3df02
refactor: Update last_text variable name in transcription filter call…
royshil Aug 29, 2024
c302d3a
feat: Add translation language utilities
royshil Aug 29, 2024
9861a6f
feat: Update ICU library configuration and dependencies
royshil Aug 29, 2024
851f6a5
refactor: Update ICU library configuration and dependencies
royshil Aug 30, 2024
8ef418c
refactor: Update ICU library configuration and dependencies
royshil Aug 30, 2024
f53151e
refactor: Update ICU library configuration and dependencies
royshil Aug 30, 2024
439e0ed
refactor: Update ICU library configuration and dependencies
royshil Aug 30, 2024
ca54053
refactor: Update ICU library configuration and dependencies
royshil Aug 30, 2024
1db1764
refactor: Update ICU library configuration and dependencies
royshil Aug 30, 2024
6f994e5
refactor: Update ICU library configuration and dependencies
royshil Aug 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,11 @@ else()
include(cmake/FetchOnnxruntime.cmake)
endif()

include(cmake/BuildICU.cmake)
# Add ICU to the target
target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU)
target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR})

target_sources(
${CMAKE_PROJECT_NAME}
PRIVATE src/plugin-main.c
Expand All @@ -114,9 +119,11 @@ target_sources(
src/whisper-utils/whisper-model-utils.cpp
src/whisper-utils/silero-vad-onnx.cpp
src/whisper-utils/token-buffer-thread.cpp
src/whisper-utils/vad-processing.cpp
src/translation/language_codes.cpp
src/translation/translation.cpp
src/translation/translation-utils.cpp
src/translation/translation-language-utils.cpp
src/ui/filter-replace-dialog.cpp)

set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name})
Expand All @@ -137,12 +144,14 @@ if(ENABLE_TESTS)
src/whisper-utils/whisper-utils.cpp
src/whisper-utils/silero-vad-onnx.cpp
src/whisper-utils/token-buffer-thread.cpp
src/whisper-utils/vad-processing.cpp
src/translation/language_codes.cpp
src/translation/translation.cpp)
src/translation/translation.cpp
src/translation/translation-language-utils.cpp)

find_libav(${CMAKE_PROJECT_NAME}-tests)

target_link_libraries(${CMAKE_PROJECT_NAME}-tests PRIVATE ct2 sentencepiece Whispercpp Ort OBS::libobs)
target_link_libraries(${CMAKE_PROJECT_NAME}-tests PRIVATE ct2 sentencepiece Whispercpp Ort OBS::libobs ICU)
target_include_directories(${CMAKE_PROJECT_NAME}-tests PRIVATE src)

# install the tests to the release/test directory
Expand Down
101 changes: 101 additions & 0 deletions cmake/BuildICU.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
include(FetchContent)
include(ExternalProject)

# ICU release to use. The release tag and archive names spell the same version
# with different separators, so derive every spelling from this one value.
set(ICU_VERSION "75.1")
# "75.1" -> "75_1" (archive file names)
string(REPLACE "." "_" ICU_VERSION_UNDERSCORE "${ICU_VERSION}")
# "75.1" -> "75-1" (release tag)
string(REPLACE "." "-" ICU_VERSION_DASH "${ICU_VERSION}")
# "75.1" -> "75" (suffix of the Windows DLL names)
string(REGEX REPLACE "\\..*$" "" ICU_VERSION_NO_MINOR "${ICU_VERSION}")

# Acquire ICU and expose one imported target per ICU library (ICU::<lib>).
#
# * Windows: download the prebuilt MSVC archive at configure time and import
#   the DLLs together with their import libraries.
# * macOS / Linux: build static libraries from the ICU git tag at build time
#   through ExternalProject.
#
# Variables set for the code that follows: ICU_INCLUDE_DIR, ICU_LIBRARY_DIR,
# ICU_LIBRARIES (plus ICU_BINARY_DIR on Windows).
if(WIN32)
  set(ICU_URL
      "https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION_DASH}/icu4c-${ICU_VERSION_UNDERSCORE}-Win64-MSVC2022.zip"
  )
  set(ICU_HASH "SHA256=7ac9c0dc6ccc1ec809c7d5689b8d831c5b8f6b11ecf70fdccc55f7ae8731ac8f")

  FetchContent_Declare(
    ICU_build
    URL ${ICU_URL}
    URL_HASH ${ICU_HASH})

  FetchContent_MakeAvailable(ICU_build)

  # Assuming the ZIP structure, adjust paths as necessary
  set(ICU_INCLUDE_DIR "${icu_build_SOURCE_DIR}/include")
  set(ICU_LIBRARY_DIR "${icu_build_SOURCE_DIR}/lib64")
  set(ICU_BINARY_DIR "${icu_build_SOURCE_DIR}/bin64")

  # Define the library names
  set(ICU_LIBRARIES icudt icuuc icuin)

  foreach(lib ${ICU_LIBRARIES})
    # Locate the import library (.lib) inside the extracted archive only
    find_library(
      ICU_LIB_${lib}
      NAMES ${lib}
      PATHS ${ICU_LIBRARY_DIR}
      NO_DEFAULT_PATH REQUIRED)
    # Locate the matching runtime DLL. REQUIRED makes a missing DLL fail at
    # configure time instead of passing a -NOTFOUND path to install() below.
    find_file(
      ICU_DLL_${lib}
      NAMES ${lib}${ICU_VERSION_NO_MINOR}.dll
      PATHS ${ICU_BINARY_DIR}
      NO_DEFAULT_PATH REQUIRED)
    # Ship the DLL next to the plugin binary
    install(FILES ${ICU_DLL_${lib}} DESTINATION "obs-plugins/64bit")
    # For a SHARED IMPORTED target on Windows, IMPORTED_LOCATION is the DLL and
    # IMPORTED_IMPLIB is the import library that is passed to the linker.
    add_library(ICU::${lib} SHARED IMPORTED GLOBAL)
    set_target_properties(ICU::${lib} PROPERTIES IMPORTED_LOCATION "${ICU_DLL_${lib}}" IMPORTED_IMPLIB
                                                                                        "${ICU_LIB_${lib}}")
  endforeach()
else()
  # NOTE(review): ICU_URL / ICU_HASH are not consumed by the ExternalProject_Add
  # call below, which fetches the sources from git instead. Kept in case other
  # code reads them - confirm and remove if unused.
  set(ICU_URL
      "https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION_DASH}/icu4c-${ICU_VERSION_UNDERSCORE}-src.tgz"
  )
  set(ICU_HASH "SHA256=cb968df3e4d2e87e8b11c49a5d01c787bd13b9545280fc6642f826527618caef")
  if(APPLE)
    set(ICU_PLATFORM "MacOSX")
    if(NOT "$ENV{MACOS_ARCH}" STREQUAL "")
      # Single argument "-arch <arch>"; the escaped space keeps it one list item
      set(TARGET_ARCH -arch\ $ENV{MACOS_ARCH})
      set(ICU_BUILD_ENV_VARS CFLAGS=${TARGET_ARCH} CXXFLAGS=${TARGET_ARCH} LDFLAGS=${TARGET_ARCH})
    else()
      # MACOS_ARCH is not set: do not emit a dangling "-arch " flag, let the
      # compiler target its default architecture.
      set(ICU_BUILD_ENV_VARS "")
    endif()
  else()
    set(ICU_PLATFORM "Linux")
    # Position-independent code so the static libraries can be linked into the
    # plugin
    set(ICU_BUILD_ENV_VARS CFLAGS=-fPIC CXXFLAGS=-fPIC LDFLAGS=-fPIC)
  endif()

  # Configure, build and install static ICU libraries into <INSTALL_DIR>.
  # BUILD_BYPRODUCTS declares the produced archives to the generator.
  ExternalProject_Add(
    ICU_build
    DOWNLOAD_EXTRACT_TIMESTAMP true
    GIT_REPOSITORY "https://github.com/unicode-org/icu.git"
    GIT_TAG "release-${ICU_VERSION_DASH}"
    CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${ICU_BUILD_ENV_VARS} <SOURCE_DIR>/icu4c/source/runConfigureICU
                      ${ICU_PLATFORM} --prefix=<INSTALL_DIR> --enable-static --disable-shared
    BUILD_COMMAND make -j4
    BUILD_BYPRODUCTS
      <INSTALL_DIR>/lib/${CMAKE_STATIC_LIBRARY_PREFIX}icudata${CMAKE_STATIC_LIBRARY_SUFFIX}
      <INSTALL_DIR>/lib/${CMAKE_STATIC_LIBRARY_PREFIX}icuuc${CMAKE_STATIC_LIBRARY_SUFFIX}
      <INSTALL_DIR>/lib/${CMAKE_STATIC_LIBRARY_PREFIX}icui18n${CMAKE_STATIC_LIBRARY_SUFFIX}
    INSTALL_COMMAND make install
    BUILD_IN_SOURCE 1)

  ExternalProject_Get_Property(ICU_build INSTALL_DIR)

  set(ICU_INCLUDE_DIR "${INSTALL_DIR}/include")
  set(ICU_LIBRARY_DIR "${INSTALL_DIR}/lib")

  set(ICU_LIBRARIES icudata icuuc icui18n)

  foreach(lib ${ICU_LIBRARIES})
    add_library(ICU::${lib} STATIC IMPORTED GLOBAL)
    # The archive only exists after the external build has run
    add_dependencies(ICU::${lib} ICU_build)
    set(ICU_LIBRARY "${ICU_LIBRARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${lib}${CMAKE_STATIC_LIBRARY_SUFFIX}")
    set_target_properties(ICU::${lib} PROPERTIES IMPORTED_LOCATION "${ICU_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES
                                                                                    "${ICU_INCLUDE_DIR}")
  endforeach()
endif()

# Create an interface target for ICU that bundles every ICU::<lib> imported
# target listed in ICU_LIBRARIES together with the ICU include directory.
add_library(ICU INTERFACE)
# ICU_build is a real build target only when ICU is built through
# ExternalProject; with the prebuilt download there is nothing to wait for, so
# only add the ordering dependency when the target exists.
if(TARGET ICU_build)
  add_dependencies(ICU ICU_build)
endif()
foreach(lib ${ICU_LIBRARIES})
  target_link_libraries(ICU INTERFACE ICU::${lib})
endforeach()
target_include_directories(ICU SYSTEM INTERFACE $<BUILD_INTERFACE:${ICU_INCLUDE_DIR}>)
12 changes: 7 additions & 5 deletions data/locale/en-US.ini
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
LocalVocalPlugin="LocalVocal Plugin"
transcription_filterAudioFilter="LocalVocal Transcription"
vad_enabled="VAD Enabled"
vad_threshold="VAD Threshold"
log_level="Internal Log Level"
log_words="Log Output to Console"
caption_to_stream="Stream Captions"
step_by_step_processing="Step-by-step processing (⚠️ increased processing)"
step_size_msec="Step size (ms)"
subtitle_sources="Output Destination"
none_no_output="None / No output"
file_output_enable="Save to File"
Expand Down Expand Up @@ -51,7 +48,6 @@ translate="Translation"
translate_add_context="Translate with context"
whisper_translate="Translate to English (Whisper)"
buffer_size_msec="Buffer size (ms)"
overlap_size_msec="Overlap size (ms)"
suppress_sentences="Suppress sentences (each line)"
translate_output="Output Destination"
dtw_token_timestamps="DTW token timestamps"
Expand Down Expand Up @@ -85,4 +81,10 @@ buffered_output_parameters="Buffered Output Configuration"
file_output_info="Note: Translation output will be saved to a file in the same directory with the target language added to the name, e.g. 'output_es.srt'."
partial_transcription="Enable Partial Transcription"
partial_transcription_info="Partial transcription will increase processing load on your machine to transcribe content in real-time, which may impact performance."
partial_latency="Latency (ms)"
partial_latency="Latency (ms)"
vad_mode="VAD Mode"
Active_VAD="Active VAD"
Hybrid_VAD="Hybrid VAD"
translate_only_full_sentences="Translate only full sentences"
duration_filter_threshold="Duration filter"
segment_duration="Segment duration"
5 changes: 3 additions & 2 deletions src/tests/localvocal-offline-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "transcription-filter.h"
#include "transcription-utils.h"
#include "whisper-utils/whisper-utils.h"
#include "whisper-utils/vad-processing.h"
#include "audio-file-utils.h"
#include "translation/language_codes.h"

Expand Down Expand Up @@ -148,7 +149,7 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
// },
// 30, std::chrono::seconds(10));

gf->vad_enabled = true;
gf->vad_mode = VAD_MODE_ACTIVE;
gf->log_words = true;
gf->caption_to_stream = false;
gf->start_timestamp_ms = now_ms();
Expand All @@ -157,7 +158,7 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
gf->buffered_output = false;

gf->target_lang = "";
gf->translation_ctx.add_context = true;
gf->translation_ctx.add_context = 1;
gf->translation_output = "";
gf->translate = false;
gf->sentence_psum_accept_thresh = 0.4;
Expand Down
52 changes: 37 additions & 15 deletions src/transcription-filter-callbacks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ std::string send_sentence_to_translation(const std::string &sentence,
struct transcription_filter_data *gf,
const std::string &source_language)
{
const std::string last_text = gf->last_text;
gf->last_text = sentence;
const std::string last_text = gf->last_text_for_translation;
gf->last_text_for_translation = sentence;
if (gf->translate && !sentence.empty()) {
obs_log(gf->log_level, "Translating text. %s -> %s", source_language.c_str(),
gf->target_lang.c_str());
Expand Down Expand Up @@ -199,11 +199,6 @@ void set_text_callback(struct transcription_filter_data *gf,
const DetectionResultWithText &resultIn)
{
DetectionResultWithText result = resultIn;
if (!result.text.empty() && (result.result == DETECTION_RESULT_SPEECH ||
result.result == DETECTION_RESULT_PARTIAL)) {
gf->last_sub_render_time = now_ms();
gf->cleared_last_sub = false;
}

std::string str_copy = result.text;

Expand Down Expand Up @@ -233,20 +228,25 @@ void set_text_callback(struct transcription_filter_data *gf,
}
}

bool should_translate =
gf->translate_only_full_sentences ? result.result == DETECTION_RESULT_SPEECH : true;

// send the sentence to translation (if enabled)
std::string translated_sentence =
send_sentence_to_translation(str_copy, gf, result.language);
should_translate ? send_sentence_to_translation(str_copy, gf, result.language) : "";

if (gf->translate) {
if (gf->translation_output == "none") {
// overwrite the original text with the translated text
str_copy = translated_sentence;
} else {
if (gf->buffered_output) {
if (result.result == DETECTION_RESULT_SPEECH) {
// buffered output - add the sentence to the monitor
gf->translation_monitor.addSentence(translated_sentence);
}
// buffered output - add the sentence to the monitor
gf->translation_monitor.addSentenceFromStdString(
translated_sentence,
get_time_point_from_ms(result.start_timestamp_ms),
get_time_point_from_ms(result.end_timestamp_ms),
result.result == DETECTION_RESULT_PARTIAL);
} else {
// non-buffered output - send the sentence to the selected source
send_caption_to_source(gf->translation_output, translated_sentence,
Expand All @@ -256,9 +256,10 @@ void set_text_callback(struct transcription_filter_data *gf,
}

if (gf->buffered_output) {
if (result.result == DETECTION_RESULT_SPEECH) {
gf->captions_monitor.addSentence(str_copy);
}
gf->captions_monitor.addSentenceFromStdString(
str_copy, get_time_point_from_ms(result.start_timestamp_ms),
get_time_point_from_ms(result.end_timestamp_ms),
result.result == DETECTION_RESULT_PARTIAL);
} else {
// non-buffered output - send the sentence to the selected source
send_caption_to_source(gf->text_source_name, str_copy, gf);
Expand All @@ -273,6 +274,21 @@ void set_text_callback(struct transcription_filter_data *gf,
result.result == DETECTION_RESULT_SPEECH) {
send_sentence_to_file(gf, result, str_copy, translated_sentence);
}

if (!result.text.empty() && (result.result == DETECTION_RESULT_SPEECH ||
result.result == DETECTION_RESULT_PARTIAL)) {
gf->last_sub_render_time = now_ms();
gf->cleared_last_sub = false;
if (result.result == DETECTION_RESULT_SPEECH) {
// save the last subtitle if it was a full sentence
gf->last_transcription_sentence.push_back(result.text);
// remove the oldest sentence if the buffer is too long
while (gf->last_transcription_sentence.size() >
(size_t)gf->n_context_sentences) {
gf->last_transcription_sentence.pop_front();
}
}
}
};

void recording_state_callback(enum obs_frontend_event event, void *data)
Expand Down Expand Up @@ -314,6 +330,12 @@ void reset_caption_state(transcription_filter_data *gf_)
}
send_caption_to_source(gf_->text_source_name, "", gf_);
send_caption_to_source(gf_->translation_output, "", gf_);
// reset translation context
gf_->last_text_for_translation = "";
gf_->last_text_translation = "";
gf_->translation_ctx.last_input_tokens.clear();
gf_->translation_ctx.last_translation_tokens.clear();
gf_->last_transcription_sentence.clear();
// flush the buffer
{
std::lock_guard<std::mutex> lock(gf_->whisper_buf_mutex);
Expand Down
13 changes: 11 additions & 2 deletions src/transcription-filter-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ struct transcription_filter_data {
size_t sentence_number;
// Minimal subtitle duration in ms
size_t min_sub_duration;
// Maximal subtitle duration in ms
size_t max_sub_duration;
// Last time a subtitle was rendered
uint64_t last_sub_render_time;
bool cleared_last_sub;
Expand All @@ -62,7 +64,7 @@ struct transcription_filter_data {
float sentence_psum_accept_thresh;

bool do_silence;
bool vad_enabled;
int vad_mode;
int log_level = LOG_DEBUG;
bool log_words;
bool caption_to_stream;
Expand All @@ -84,11 +86,17 @@ struct transcription_filter_data {
bool initial_creation = true;
bool partial_transcription = false;
int partial_latency = 1000;
float duration_filter_threshold = 2.25f;
int segment_duration = 7000;

// Last transcription result
std::string last_text;
std::string last_text_for_translation;
std::string last_text_translation;

// Transcription context sentences
int n_context_sentences;
std::deque<std::string> last_transcription_sentence;

// Text source to output the subtitles
std::string text_source_name;
// Callback to set the text in the output text source (subtitles)
Expand All @@ -110,6 +118,7 @@ struct transcription_filter_data {
struct translation_context translation_ctx;
std::string translation_model_index;
std::string translation_model_path_external;
bool translate_only_full_sentences;

bool buffered_output = false;
TokenBufferThread captions_monitor;
Expand Down
Loading
Loading