mtmd: refactor mtmd_decode_use_mrope (#22161)

2026-04-23 20:12:00 +02:00 · 2026-04-20 14:45:11 +02:00 · 2026-04-20 14:45:11 +02:00 · a678916623
commit a678916623
parent 81df3f7cfa
1 changed files with 5 additions and 15 deletions
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -131,6 +131,7 @@ struct mtmd_context {
    int n_threads;
    std::string media_marker;
    const int n_embd_text;
+    llama_rope_type decoder_rope;

    // these are not token, but strings used to mark the beginning and end of image/audio embeddings
    std::string img_beg;
@@ -167,7 +168,8 @@ struct mtmd_context {
        print_timings(ctx_params.print_timings),
        n_threads    (ctx_params.n_threads),
        media_marker (ctx_params.media_marker),
-        n_embd_text  (llama_model_n_embd_inp(text_model))
+        n_embd_text  (llama_model_n_embd_inp(text_model)),
+        decoder_rope (llama_model_rope_type(text_model))
    {
        if (ctx_params.image_marker != nullptr) {
            throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
@@ -1029,20 +1031,8 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx, const mtmd_input_chunk * chu
 }

 bool mtmd_decode_use_mrope(mtmd_context * ctx) {
-    if (ctx->ctx_v == nullptr && ctx->proj_type_a() == PROJECTOR_TYPE_QWEN3A) {
-        // qwen3-asr
-        return true;
-    }
-    switch (ctx->proj_type_v()) {
-        case PROJECTOR_TYPE_QWEN2VL:
-        case PROJECTOR_TYPE_QWEN25VL:
-        case PROJECTOR_TYPE_QWEN3VL:
-        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_PADDLEOCR:
-            return true;
-        default:
-            return false;
-    }
+    return ctx->decoder_rope == LLAMA_ROPE_TYPE_MROPE
+        || ctx->decoder_rope == LLAMA_ROPE_TYPE_IMROPE;
 }

 bool mtmd_support_vision(mtmd_context * ctx) {