From a678916623ddef89c2a43776df24e00a52b17638 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Mon, 20 Apr 2026 14:45:11 +0200
Subject: [PATCH] mtmd: refactor mtmd_decode_use_mrope (#22161)

---
 tools/mtmd/mtmd.cpp | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 52fca4e81b..35b4bba77b 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -131,6 +131,7 @@ struct mtmd_context {
     int n_threads;
     std::string media_marker;
     const int n_embd_text;
+    llama_rope_type decoder_rope;
 
     // these are not token, but strings used to mark the beginning and end of image/audio embeddings
     std::string img_beg;
@@ -167,7 +168,8 @@ struct mtmd_context {
         print_timings(ctx_params.print_timings),
         n_threads    (ctx_params.n_threads),
         media_marker (ctx_params.media_marker),
-        n_embd_text  (llama_model_n_embd_inp(text_model))
+        n_embd_text  (llama_model_n_embd_inp(text_model)),
+        decoder_rope (llama_model_rope_type(text_model))
     {
         if (ctx_params.image_marker != nullptr) {
             throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
@@ -1029,20 +1031,8 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx, const mtmd_input_chunk * chu
 }
 
 bool mtmd_decode_use_mrope(mtmd_context * ctx) {
-    if (ctx->ctx_v == nullptr && ctx->proj_type_a() == PROJECTOR_TYPE_QWEN3A) {
-        // qwen3-asr
-        return true;
-    }
-    switch (ctx->proj_type_v()) {
-        case PROJECTOR_TYPE_QWEN2VL:
-        case PROJECTOR_TYPE_QWEN25VL:
-        case PROJECTOR_TYPE_QWEN3VL:
-        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_PADDLEOCR:
-            return true;
-        default:
-            return false;
-    }
+    return ctx->decoder_rope == LLAMA_ROPE_TYPE_MROPE
+        || ctx->decoder_rope == LLAMA_ROPE_TYPE_IMROPE;
 }
 
 bool mtmd_support_vision(mtmd_context * ctx) {