diff --git a/examples/models/parakeet/README.md b/examples/models/parakeet/README.md
index 713dcf616e7..5c0cbfb119d 100644
--- a/examples/models/parakeet/README.md
+++ b/examples/models/parakeet/README.md
@@ -98,6 +98,8 @@ DYLD_LIBRARY_PATH=/usr/lib ./cmake-out/examples/models/parakeet/parakeet_runner
 |----------|-------------|
 | `--model_path` | Path to Parakeet model (.pte) |
 | `--audio_path` | Path to input audio file (.wav) |
-| `--tokenizer_path` | Path to tokenizer file (default: `tokenizer.json`) |
+| `--tokenizer_path` | Path to tokenizer file (default: `tokenizer.model`) |
 | `--data_path` | Path to data file (.ptd) for delegate data (optional, required for Metal/CUDA) |
-| `--timestamps` | Timestamp output mode: `none\|token\|word\|segment\|all` (default: `segment`) |
+| `--timestamps` | Timestamp output mode: `none\|token\|word\|segment\|all` (default: `segment`) |
+| `--repeat` | Number of timed inference iterations for benchmarking (default: `1`) |
+| `--warmup_repeat` | Number of warmup iterations before the timed benchmark (default: `1`) |
diff --git a/examples/models/parakeet/main.cpp b/examples/models/parakeet/main.cpp
index d9d4de8f68c..e8b5845f826 100644
--- a/examples/models/parakeet/main.cpp
+++ b/examples/models/parakeet/main.cpp
@@ -7,6 +7,7 @@
  */
 
 #include
+#include <chrono>
 #include
 #include
 #include
@@ -47,6 +48,11 @@ DEFINE_string(
     timestamps,
     "segment",
     "Timestamp output mode: none|token|word|segment|all");
+DEFINE_int32(repeat, 1, "Number of times to run inference (for benchmarking).");
+DEFINE_int32(
+    warmup_repeat,
+    1,
+    "Number of warmup iterations to initialize backends before timed runs.");
 
 using ::executorch::extension::from_blob;
 using ::executorch::extension::Module;
@@ -59,6 +65,7 @@ using ::parakeet::TokenId;
 using ::parakeet::TokenWithTextInfo;
 
 namespace {
+
 // TDT duration values
 const std::vector DURATIONS = {0, 1, 2, 3, 4};
 
@@ -105,19 +112,277 @@ TimestampOutputMode parse_timestamp_output_mode(const std::string& raw_arg) {
       "'. Expected: token, word, segment, all.");
 }
 
-std::vector greedy_decode_executorch(
-    Module& model,
+} // namespace
+
+// ============================================================================
+// ParakeetRunner: Encapsulates model loading and inference
+// ============================================================================
+
+struct ParakeetConfig {
+  int64_t vocab_size;
+  int64_t blank_id;
+  int64_t num_rnn_layers;
+  int64_t pred_hidden;
+  int64_t sample_rate;
+  double window_stride;
+  int64_t encoder_subsampling_factor;
+};
+
+struct TranscriptionResult {
+  std::string text;
+  std::vector tokens;
+  std::vector token_timestamps;
+  std::vector word_timestamps;
+  std::vector segment_timestamps;
+
+  // Timing information (in milliseconds)
+  struct Timing {
+    int64_t preprocessor_ms = 0;
+    int64_t encoder_ms = 0;
+    int64_t decoder_total_ms = 0;
+    int64_t decoder_step_ms = 0; // Time in decoder_step calls
+    int64_t joint_ms = 0; // Time in joint calls
+
+    // Computed: overhead from tensor creation, memcpy, loop control
+    int64_t loop_overhead_ms() const {
+      return decoder_total_ms - decoder_step_ms - joint_ms;
+    }
+  } timing;
+};
+
+class ParakeetRunner {
+ public:
+  // Factory method to load model, tokenizer, and cache metadata
+  static std::unique_ptr<ParakeetRunner> load(
+      const std::string& model_path,
+      const std::string& tokenizer_path,
+      const std::string& data_path = "");
+
+  // Run inference on audio samples
+  TranscriptionResult transcribe(const std::vector<float>& audio_samples);
+
+  // Run inference on audio file
+  TranscriptionResult transcribe(const std::string& audio_path);
+
+  // Accessors for timestamp computation
+  const ParakeetConfig& config() const {
+    return config_;
+  }
+  double frame_to_seconds() const {
+    return config_.window_stride *
+        static_cast<double>(config_.encoder_subsampling_factor);
+  }
+
+ private:
+  ParakeetRunner() = default;
+
+  // Greedy TDT decoding
+  std::vector greedy_decode(
+      const ::executorch::aten::Tensor& f_proj,
+      int64_t encoder_len,
+      TranscriptionResult::Timing& timing);
+
+  std::unique_ptr<Module> model_;
+  std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
+  ParakeetConfig config_;
+  std::unordered_set supported_punctuation_;
+};
+
+// ----------------------------------------------------------------------------
+// ParakeetRunner Implementation
+// ----------------------------------------------------------------------------
+
+std::unique_ptr<ParakeetRunner> ParakeetRunner::load(
+    const std::string& model_path,
+    const std::string& tokenizer_path,
+    const std::string& data_path) {
+  auto runner = std::unique_ptr<ParakeetRunner>(new ParakeetRunner());
+
+  // Load model
+  ET_LOG(Info, "Loading model from: %s", model_path.c_str());
+  if (!data_path.empty()) {
+    ET_LOG(Info, "Loading data from: %s", data_path.c_str());
+    runner->model_ =
+        std::make_unique<Module>(model_path, data_path, Module::LoadMode::Mmap);
+  } else {
+    runner->model_ =
+        std::make_unique<Module>(model_path, Module::LoadMode::Mmap);
+  }
+
+  auto load_error = runner->model_->load();
+  if (load_error != Error::Ok) {
+    ET_LOG(Error, "Failed to load model.");
+    return nullptr;
+  }
+
+  // Load tokenizer
+  ET_LOG(Info, "Loading tokenizer from: %s", tokenizer_path.c_str());
+  runner->tokenizer_ =
+      ::executorch::extension::llm::load_tokenizer(tokenizer_path);
+  if (!runner->tokenizer_ || !runner->tokenizer_->is_loaded()) {
+    ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path.c_str());
+    return nullptr;
+  }
+
+  // Query model metadata
+  std::vector<::executorch::runtime::EValue> empty_inputs;
+  auto num_rnn_layers_result =
+      runner->model_->execute("num_rnn_layers", empty_inputs);
+  auto pred_hidden_result =
+      runner->model_->execute("pred_hidden", empty_inputs);
+  auto vocab_size_result = runner->model_->execute("vocab_size", empty_inputs);
+  auto blank_id_result = runner->model_->execute("blank_id", empty_inputs);
+  auto sample_rate_result =
+      runner->model_->execute("sample_rate", empty_inputs);
+  auto window_stride_result =
+      runner->model_->execute("window_stride", empty_inputs);
+  auto encoder_subsampling_factor_result =
+      runner->model_->execute("encoder_subsampling_factor", empty_inputs);
+
+  if (!num_rnn_layers_result.ok() || !pred_hidden_result.ok() ||
+      !vocab_size_result.ok() || !blank_id_result.ok() ||
+      !sample_rate_result.ok() || !window_stride_result.ok() ||
+      !encoder_subsampling_factor_result.ok()) {
+    ET_LOG(
+        Error,
+        "Failed to query model metadata. Make sure the model was exported with constant_methods.");
+    return nullptr;
+  }
+
+  runner->config_.vocab_size = vocab_size_result.get()[0].toInt();
+  runner->config_.blank_id = blank_id_result.get()[0].toInt();
+  runner->config_.num_rnn_layers = num_rnn_layers_result.get()[0].toInt();
+  runner->config_.pred_hidden = pred_hidden_result.get()[0].toInt();
+  runner->config_.sample_rate = sample_rate_result.get()[0].toInt();
+  runner->config_.window_stride = window_stride_result.get()[0].toDouble();
+  runner->config_.encoder_subsampling_factor =
+      encoder_subsampling_factor_result.get()[0].toInt();
+
+  ET_LOG(
+      Info,
+      "Model metadata: vocab_size=%lld, blank_id=%lld, num_rnn_layers=%lld, "
+      "pred_hidden=%lld, sample_rate=%lld, window_stride=%.6f, "
+      "encoder_subsampling_factor=%lld",
+      static_cast<long long>(runner->config_.vocab_size),
+      static_cast<long long>(runner->config_.blank_id),
+      static_cast<long long>(runner->config_.num_rnn_layers),
+      static_cast<long long>(runner->config_.pred_hidden),
+      static_cast<long long>(runner->config_.sample_rate),
+      runner->config_.window_stride,
+      static_cast<long long>(runner->config_.encoder_subsampling_factor));
+
+  // Derive supported punctuation for timestamp computation
+  runner->supported_punctuation_ =
+      parakeet::tokenizer_utils::derive_supported_punctuation(
+          *runner->tokenizer_);
+
+  return runner;
+}
+
+TranscriptionResult ParakeetRunner::transcribe(
+    const std::vector<float>& audio_samples) {
+  TranscriptionResult result;
+
+  // Create audio tensor (from_blob requires non-const but doesn't modify data)
+  auto audio_tensor = from_blob(
+      const_cast<float*>(audio_samples.data()),
+      {static_cast<::executorch::aten::SizesType>(audio_samples.size())},
+      ::executorch::aten::ScalarType::Float);
+  std::vector<int64_t> audio_len_data = {
+      static_cast<int64_t>(audio_samples.size())};
+  auto audio_len_tensor = from_blob(
+      audio_len_data.data(), {1}, ::executorch::aten::ScalarType::Long);
+
+  // Run preprocessor
+  auto proc_start = std::chrono::high_resolution_clock::now();
+  auto proc_result = model_->execute(
+      "preprocessor", std::vector{audio_tensor, audio_len_tensor});
+  auto proc_end = std::chrono::high_resolution_clock::now();
+  result.timing.preprocessor_ms =
+      std::chrono::duration_cast<std::chrono::milliseconds>(
+          proc_end - proc_start)
+          .count();
+
+  if (!proc_result.ok()) {
+    ET_LOG(Error, "Preprocessor forward failed.");
+    return result;
+  }
+  auto& proc_outputs = proc_result.get();
+  auto mel = proc_outputs[0].toTensor();
+  int64_t mel_len_value =
+      proc_outputs[1].toTensor().const_data_ptr<int64_t>()[0];
+
+  // Run encoder
+  std::vector<int64_t> mel_len_data = {mel_len_value};
+  auto mel_len =
+      from_blob(mel_len_data.data(), {1}, ::executorch::aten::ScalarType::Long);
+
+  auto enc_start = std::chrono::high_resolution_clock::now();
+  auto enc_result =
+      model_->execute("encoder", std::vector{mel, mel_len});
+  auto enc_end = std::chrono::high_resolution_clock::now();
+  result.timing.encoder_ms =
+      std::chrono::duration_cast<std::chrono::milliseconds>(enc_end - enc_start)
+          .count();
+
+  if (!enc_result.ok()) {
+    ET_LOG(Error, "Encoder forward failed.");
+    return result;
+  }
+  auto& enc_outputs = enc_result.get();
+  auto f_proj = enc_outputs[0].toTensor();
+  int64_t encoded_len = enc_outputs[1].toTensor().const_data_ptr<int64_t>()[0];
+
+  // Run greedy decode
+  auto decode_start = std::chrono::high_resolution_clock::now();
+  result.tokens = greedy_decode(f_proj, encoded_len, result.timing);
+  auto decode_end = std::chrono::high_resolution_clock::now();
+  result.timing.decoder_total_ms =
+      std::chrono::duration_cast<std::chrono::milliseconds>(
+          decode_end - decode_start)
+          .count();
+
+  // Convert tokens to text
+  result.text = parakeet::tokenizer_utils::decode_token_sequence(
+      result.tokens, *tokenizer_);
+
+  // Compute timestamps
+  try {
+    result.token_timestamps =
+        parakeet::timestamp_utils::get_tokens_with_text_info(
+            result.tokens, *tokenizer_, supported_punctuation_);
+    result.word_timestamps = parakeet::timestamp_utils::get_words_offsets(
+        result.token_timestamps, *tokenizer_, supported_punctuation_);
+    result.segment_timestamps =
+        parakeet::timestamp_utils::get_segment_offsets(result.word_timestamps);
+  } catch (const std::exception& e) {
+    ET_LOG(Error, "Failed to compute timestamps: %s", e.what());
+  }
+
+  return result;
+}
+
+TranscriptionResult ParakeetRunner::transcribe(const std::string& audio_path) {
+  ET_LOG(Info, "Loading audio from: %s", audio_path.c_str());
+  std::vector<float> audio_data =
+      ::executorch::extension::llm::load_wav_audio_data(audio_path);
+  ET_LOG(Info, "Loaded %zu audio samples", audio_data.size());
+  return transcribe(audio_data);
+}
+
+std::vector ParakeetRunner::greedy_decode(
     const ::executorch::aten::Tensor& f_proj,
     int64_t encoder_len,
-    int64_t blank_id,
-    int64_t num_rnn_layers = 2,
-    int64_t pred_hidden = 640,
-    int64_t max_symbols_per_step = 10) {
+    TranscriptionResult::Timing& timing) {
   std::vector hypothesis;
+  const int64_t blank_id = config_.blank_id;
+  const int64_t num_rnn_layers = config_.num_rnn_layers;
+  const int64_t pred_hidden = config_.pred_hidden;
+  const int64_t max_symbols_per_step = 10;
+
   // Shape: [1, time_steps, joint_hidden]
   auto f_proj_sizes = f_proj.sizes();
-  int64_t time_steps = f_proj_sizes[1];
   int64_t proj_dim = f_proj_sizes[2];
 
   // Initialize LSTM state
@@ -145,9 +410,14 @@ std::vector greedy_decode_executorch(
   std::vector<int64_t> sos_token_data = {blank_id};
   auto sos_token = from_blob(
       sos_token_data.data(), {1, 1}, ::executorch::aten::ScalarType::Long);
-  auto decoder_init_result = model.execute(
-      "decoder_step",
-      std::vector<::executorch::runtime::EValue>{sos_token, h, c});
+
+  auto sos_start = std::chrono::high_resolution_clock::now();
+  auto decoder_init_result =
+      model_->execute("decoder_step", std::vector{sos_token, h, c});
+  auto sos_end = std::chrono::high_resolution_clock::now();
+  timing.decoder_step_ms +=
+      std::chrono::duration_cast<std::chrono::milliseconds>(sos_end - sos_start)
+          .count();
   if (!decoder_init_result.ok()) {
     ET_LOG(Error, "decoder_step (SOS) failed");
     return hypothesis;
@@ -175,7 +445,6 @@ std::vector greedy_decode_executorch(
   // Scan over encoder output
   while (t < encoder_len) {
-    // Get encoder frame at time t: f_proj[:, t:t+1, :]
     const float* f_proj_ptr = f_proj.const_data_ptr<float>();
     std::vector<float> f_t_data(1 * 1 * proj_dim);
@@ -192,8 +461,14 @@ std::vector greedy_decode_executorch(
         {1, 1, static_cast<::executorch::aten::SizesType>(proj_dim)},
         ::executorch::aten::ScalarType::Float);
 
-    auto joint_result = model.execute(
-        "joint", std::vector<::executorch::runtime::EValue>{f_t, g_proj});
+    auto joint_start = std::chrono::high_resolution_clock::now();
+    auto joint_result =
+        model_->execute("joint", std::vector{f_t, g_proj});
+    auto joint_end = std::chrono::high_resolution_clock::now();
+    timing.joint_ms += std::chrono::duration_cast<std::chrono::milliseconds>(
+                           joint_end - joint_start)
+                           .count();
+
     if (!joint_result.ok()) {
       ET_LOG(Error, "joint failed at t=%lld", static_cast<long long>(t));
       return hypothesis;
@@ -214,9 +489,15 @@ std::vector greedy_decode_executorch(
       auto token = from_blob(
           token_data.data(), {1, 1}, ::executorch::aten::ScalarType::Long);
 
-      auto decoder_result = model.execute(
-          "decoder_step",
-          std::vector<::executorch::runtime::EValue>{token, h, c});
+      auto dec_start = std::chrono::high_resolution_clock::now();
+      auto decoder_result =
+          model_->execute("decoder_step", std::vector{token, h, c});
+      auto dec_end = std::chrono::high_resolution_clock::now();
+      timing.decoder_step_ms +=
+          std::chrono::duration_cast<std::chrono::milliseconds>(
+              dec_end - dec_start)
+              .count();
+
       if (!decoder_result.ok()) {
         ET_LOG(Error, "decoder_step failed");
         return hypothesis;
@@ -226,7 +507,6 @@ std::vector greedy_decode_executorch(
       auto new_h = outputs[1].toTensor();
       auto new_c = outputs[2].toTensor();
 
-      // Update h, c, and g_proj
       std::memcpy(
           h_data.data(),
           new_h.const_data_ptr(),
@@ -257,7 +537,9 @@ std::vector greedy_decode_executorch(
   return hypothesis;
 }
 
-} // namespace
+// ============================================================================
+// Main
+// ============================================================================
 
 int main(int argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
@@ -275,180 +557,103 @@ int main(int argc, char** argv) {
     return 1;
   }
 
-  // Load model (which includes the bundled preprocessor)
-  ET_LOG(Info, "Loading model from: %s", FLAGS_model_path.c_str());
-  std::unique_ptr<Module> model;
-  if (!FLAGS_data_path.empty()) {
-    ET_LOG(Info, "Loading data from: %s", FLAGS_data_path.c_str());
-    model = std::make_unique<Module>(
-        FLAGS_model_path, FLAGS_data_path, Module::LoadMode::Mmap);
-  } else {
-    model = std::make_unique<Module>(FLAGS_model_path, Module::LoadMode::Mmap);
-  }
-  auto model_load_error = model->load();
-  if (model_load_error != Error::Ok) {
-    ET_LOG(Error, "Failed to load model.");
+  // Load model and tokenizer (done once)
+  auto load_start = std::chrono::high_resolution_clock::now();
+  auto runner = ParakeetRunner::load(
+      FLAGS_model_path, FLAGS_tokenizer_path, FLAGS_data_path);
+  auto load_end = std::chrono::high_resolution_clock::now();
+
+  if (!runner) {
+    ET_LOG(Error, "Failed to initialize ParakeetRunner.");
     return 1;
   }
 
-  // Load audio
+  auto load_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
+                     load_end - load_start)
+                     .count();
+  ET_LOG(Info, "Load time: %lldms", static_cast<long long>(load_ms));
+
+  // Load audio once (not included in inference timing)
   ET_LOG(Info, "Loading audio from: %s", FLAGS_audio_path.c_str());
   std::vector<float> audio_data =
       ::executorch::extension::llm::load_wav_audio_data(FLAGS_audio_path);
   ET_LOG(Info, "Loaded %zu audio samples", audio_data.size());
 
-  auto audio_tensor = from_blob(
-      audio_data.data(),
-      {static_cast<::executorch::aten::SizesType>(audio_data.size())},
-      ::executorch::aten::ScalarType::Float);
-  std::vector<int64_t> audio_len_data = {
-      static_cast<int64_t>(audio_data.size())};
-  auto audio_len_tensor = from_blob(
-      audio_len_data.data(), {1}, ::executorch::aten::ScalarType::Long);
-
-  ET_LOG(Info, "Running preprocessor...");
-  auto proc_result = model->execute(
-      "preprocessor",
-      std::vector<::executorch::runtime::EValue>{
-          audio_tensor, audio_len_tensor});
-  if (!proc_result.ok()) {
-    ET_LOG(Error, "Preprocessor forward failed.");
-    return 1;
+  // Run inference
+  TranscriptionResult result;
+  const int num_iterations = std::max(1, FLAGS_repeat);
+  const int num_warmup = std::max(1, FLAGS_warmup_repeat);
+
+  // Warmup runs to initialize backends (e.g., Metal shader compilation).
+  // The first iteration includes backend initialization overhead.
+  // Note: Warmup can use any audio - it doesn't need to be the audio you
+  // want to transcribe. The purpose is to warm up the GPU/accelerator.
+  ET_LOG(Info, "Running %d warmup iteration(s)...", num_warmup);
+  auto warmup_start = std::chrono::high_resolution_clock::now();
+  for (int i = 0; i < num_warmup; ++i) {
+    result = runner->transcribe(audio_data);
   }
-  auto& proc_outputs = proc_result.get();
-  auto mel = proc_outputs[0].toTensor();
-  auto mel_len_tensor_out = proc_outputs[1].toTensor();
-  int64_t mel_len_value = mel_len_tensor_out.const_data_ptr<int64_t>()[0];
-
-  // Create mel_len tensor for encoder
-  std::vector<int64_t> mel_len_data = {mel_len_value};
-  auto mel_len =
-      from_blob(mel_len_data.data(), {1}, ::executorch::aten::ScalarType::Long);
-
+  auto warmup_end = std::chrono::high_resolution_clock::now();
+  auto warmup_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
+                       warmup_end - warmup_start)
                       .count();
   ET_LOG(
       Info,
-      "Mel spectrogram shape: [%ld, %ld, %ld], mel_len: %lld",
-      static_cast<long>(mel.sizes()[0]),
-      static_cast<long>(mel.sizes()[1]),
-      static_cast<long>(mel.sizes()[2]),
-      static_cast<long long>(mel_len_value));
-
-  ET_LOG(Info, "Running encoder...");
-  auto enc_result = model->execute(
-      "encoder", std::vector<::executorch::runtime::EValue>{mel, mel_len});
-  if (!enc_result.ok()) {
-    ET_LOG(Error, "Encoder forward failed.");
-    return 1;
+      "Warmup time: %lldms (%d iteration(s), includes backend initialization)",
+      static_cast<long long>(warmup_ms),
+      num_warmup);
+
+  // Timed inference iterations
+  ET_LOG(Info, "Running %d timed inference iteration(s)...", num_iterations);
+  auto infer_start = std::chrono::high_resolution_clock::now();
+  for (int i = 0; i < num_iterations; ++i) {
+    result = runner->transcribe(audio_data);
+    if (num_iterations > 1 && (i + 1) % 10 == 0) {
+      ET_LOG(Info, "Completed iteration %d/%d", i + 1, num_iterations);
+    }
   }
-  auto& enc_outputs = enc_result.get();
-  auto f_proj = enc_outputs[0].toTensor(); // [B, T, joint_hidden]
-  int64_t encoded_len = enc_outputs[1].toTensor().const_data_ptr<int64_t>()[0];
+  auto infer_end = std::chrono::high_resolution_clock::now();
 
-  ET_LOG(
-      Info,
-      "Encoder output (f_proj) shape: [%ld, %ld, %ld], len=%ld",
-      static_cast<long>(f_proj.sizes()[0]),
-      static_cast<long>(f_proj.sizes()[1]),
-      static_cast<long>(f_proj.sizes()[2]),
-      static_cast<long>(encoded_len));
-
-  // Query model metadata from constant_methods
-  std::vector<::executorch::runtime::EValue> empty_inputs;
-  auto num_rnn_layers_result = model->execute("num_rnn_layers", empty_inputs);
-  auto pred_hidden_result = model->execute("pred_hidden", empty_inputs);
-  auto vocab_size_result = model->execute("vocab_size", empty_inputs);
-  auto blank_id_result = model->execute("blank_id", empty_inputs);
-  auto sample_rate_result = model->execute("sample_rate", empty_inputs);
-  auto window_stride_result = model->execute("window_stride", empty_inputs);
-  auto encoder_subsampling_factor_result =
-      model->execute("encoder_subsampling_factor", empty_inputs);
+  auto total_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
+                      infer_end - infer_start)
                      .count();
+  double avg_ms = static_cast<double>(total_ms) / num_iterations;
 
-  if (!num_rnn_layers_result.ok() || !pred_hidden_result.ok() ||
-      !vocab_size_result.ok() || !blank_id_result.ok() ||
-      !sample_rate_result.ok() || !window_stride_result.ok() ||
-      !encoder_subsampling_factor_result.ok()) {
+  if (num_iterations == 1) {
+    ET_LOG(Info, "Inference time: %lldms", static_cast<long long>(total_ms));
+  } else {
     ET_LOG(
-        Error,
-        "Failed to query model metadata. Make sure the model was exported with constant_methods.");
-    return 1;
+        Info,
+        "Inference time: %lldms total, %.2fms avg (%d iterations)",
+        static_cast<long long>(total_ms),
+        avg_ms,
+        num_iterations);
   }
 
-  int64_t vocab_size = vocab_size_result.get()[0].toInt();
-  int64_t blank_id = blank_id_result.get()[0].toInt();
-  int64_t num_rnn_layers = num_rnn_layers_result.get()[0].toInt();
-  int64_t pred_hidden = pred_hidden_result.get()[0].toInt();
-  int64_t sample_rate = sample_rate_result.get()[0].toInt();
-  double window_stride = window_stride_result.get()[0].toDouble();
-  int64_t encoder_subsampling_factor =
-      encoder_subsampling_factor_result.get()[0].toInt();
-
+  // Log granular timing breakdown (from last iteration)
   ET_LOG(
       Info,
-      "Model metadata: vocab_size=%lld, blank_id=%lld, num_rnn_layers=%lld, pred_hidden=%lld, sample_rate=%lld, window_stride=%.6f, encoder_subsampling_factor=%lld",
-      static_cast<long long>(vocab_size),
-      static_cast<long long>(blank_id),
-      static_cast<long long>(num_rnn_layers),
-      static_cast<long long>(pred_hidden),
-      static_cast<long long>(sample_rate),
-      window_stride,
-      encoder_subsampling_factor);
-
-  ET_LOG(Info, "Running TDT greedy decode...");
-  auto decoded_tokens = greedy_decode_executorch(
-      *model, f_proj, encoded_len, blank_id, num_rnn_layers, pred_hidden);
-
-  ET_LOG(Info, "Decoded %zu tokens", decoded_tokens.size());
-
-  // Load tokenizer
-  ET_LOG(Info, "Loading tokenizer from: %s", FLAGS_tokenizer_path.c_str());
-  auto tokenizer =
-      ::executorch::extension::llm::load_tokenizer(FLAGS_tokenizer_path);
-  if (!tokenizer || !tokenizer->is_loaded()) {
-    ET_LOG(
-        Error,
-        "Failed to load tokenizer from: %s",
-        FLAGS_tokenizer_path.c_str());
-    return 1;
-  }
-
-  // Convert tokens to text
-  std::string text = parakeet::tokenizer_utils::decode_token_sequence(
-      decoded_tokens, *tokenizer);
-  std::cout << "Transcribed text: " << text << std::endl;
+      "Granular timing - Preprocessor: %lldms, Encoder: %lldms, Decoder: %lldms "
+      "(decoder_step: %lldms, joint: %lldms, loop_overhead: %lldms)",
+      static_cast<long long>(result.timing.preprocessor_ms),
+      static_cast<long long>(result.timing.encoder_ms),
+      static_cast<long long>(result.timing.decoder_total_ms),
+      static_cast<long long>(result.timing.decoder_step_ms),
+      static_cast<long long>(result.timing.joint_ms),
+      static_cast<long long>(result.timing.loop_overhead_ms()));
+
+  // Output transcription
+  std::cout << "Transcribed text: " << result.text << std::endl;
 
   if (!timestamp_mode.enabled()) {
     return 0;
   }
 
-  ET_LOG(Info, "Computing timestamps...");
-  std::unordered_set supported_punctuation =
-      parakeet::tokenizer_utils::derive_supported_punctuation(*tokenizer);
-  ET_LOG(
-      Info,
-      "Derived supported_punctuation size=%zu",
-      supported_punctuation.size());
-
-  // for simplicity, compute all levels of timestamps regardless of mode
-  std::vector tokens_with_text_info;
-  try {
-    tokens_with_text_info =
-        parakeet::timestamp_utils::get_tokens_with_text_info(
-            decoded_tokens, *tokenizer, supported_punctuation);
-  } catch (const std::exception& e) {
-    ET_LOG(Error, "Failed to get tokens with text info: %s", e.what());
-    return 1;
-  }
-  const auto word_offsets = parakeet::timestamp_utils::get_words_offsets(
-      tokens_with_text_info, *tokenizer, supported_punctuation);
-  const auto segment_offsets =
-      parakeet::timestamp_utils::get_segment_offsets(word_offsets);
-
-  const double frame_to_seconds =
-      window_stride * static_cast<double>(encoder_subsampling_factor);
+  const double frame_to_seconds = runner->frame_to_seconds();
 
   if (timestamp_mode.segment) {
     std::cout << "\nSegment timestamps:" << std::endl;
-    for (const auto& segment : segment_offsets) {
+    for (const auto& segment : result.segment_timestamps) {
       const double start = segment.start_offset * frame_to_seconds;
       const double end = segment.end_offset * frame_to_seconds;
       std::cout << start << "s - " << end << "s : " << segment.text
@@ -458,7 +663,7 @@ int main(int argc, char** argv) {
 
   if (timestamp_mode.word) {
     std::cout << "\nWord timestamps:" << std::endl;
-    for (const auto& word : word_offsets) {
+    for (const auto& word : result.word_timestamps) {
      const double start = word.start_offset * frame_to_seconds;
       const double end = word.end_offset * frame_to_seconds;
       std::cout << start << "s - " << end << "s : " << word.text << std::endl;
@@ -467,7 +672,7 @@ int main(int argc, char** argv) {
 
   if (timestamp_mode.token) {
     std::cout << "\nToken timestamps:" << std::endl;
-    for (const auto& token : tokens_with_text_info) {
+    for (const auto& token : result.token_timestamps) {
       const double start = token.start_offset * frame_to_seconds;
       const double end = token.end_offset * frame_to_seconds;
       std::cout << start << "s - " << end << "s : " << token.decoded_text