llama.cpp/tools/server/server-context.h
Xuan-Son Nguyen 6c2131773c
cli: new CLI experience (#17824)
* wip

* wip

* fix logging, add display info

* handle commands

* add args

* wip

* move old cli to llama-completion

* rm deprecation notice

* move server to a shared library

* move ci to llama-completion

* add loading animation

* add --show-timings arg

* add /read command, improve LOG_ERR

* add args for speculative decoding, enable show timings by default

* add arg --image and --audio

* fix windows build

* support reasoning_content

* fix llama2c workflow

* color default is auto

* fix merge conflicts

* properly fix color problem

Co-authored-by: bandoti <bandoti@users.noreply.github.com>

* better loading spinner

* make sure to clean color on force-exit

* also clear input files on "/clear"

* simplify common_log_flush

* add warning in mtmd-cli

* implement console writter

* fix data race

* add attribute

* fix llama-completion and mtmd-cli

* add some notes about console::log

* fix compilation

---------

Co-authored-by: bandoti <bandoti@users.noreply.github.com>
2025-12-10 15:28:59 +01:00

94 lines
3.2 KiB
C++

#include "server-http.h"
#include "server-task.h"
#include "server-queue.h"
#include <nlohmann/json_fwd.hpp>
#include <cstddef>
#include <memory>
struct server_context_impl; // private implementation
struct server_context_info {
std::string build_info;
std::string model_name;
bool has_inp_image;
bool has_inp_audio;
};
struct server_context {
std::unique_ptr<server_context_impl> impl;
server_context();
~server_context();
// initialize slots and server-related data
void init();
// load the model and initialize llama_context
// returns true on success
bool load_model(const common_params & params);
// this function will block main thread until termination
void start_loop();
// terminate main loop (will unblock start_loop)
void terminate();
// get the underlaying llama_context
llama_context * get_llama_context() const;
// get a new response reader, used by CLI application
server_response_reader get_response_reader();
// get server info
// used by CLI application
server_context_info get_info() const;
};
// forward declarations
struct server_res_generator;
struct server_routes {
server_routes(const common_params & params, server_context & ctx_server, std::function<bool()> is_ready = []() { return true; })
: params(params), ctx_server(*ctx_server.impl), is_ready(is_ready) {
init_routes();
}
void init_routes();
// handlers using lambda function, so that they can capture `this` without `std::bind`
server_http_context::handler_t get_health;
server_http_context::handler_t get_metrics;
server_http_context::handler_t get_slots;
server_http_context::handler_t post_slots;
server_http_context::handler_t get_props;
server_http_context::handler_t post_props;
server_http_context::handler_t get_api_show;
server_http_context::handler_t post_infill;
server_http_context::handler_t post_completions;
server_http_context::handler_t post_completions_oai;
server_http_context::handler_t post_chat_completions;
server_http_context::handler_t post_anthropic_messages;
server_http_context::handler_t post_anthropic_count_tokens;
server_http_context::handler_t post_apply_template;
server_http_context::handler_t get_models;
server_http_context::handler_t post_tokenize;
server_http_context::handler_t post_detokenize;
server_http_context::handler_t post_embeddings;
server_http_context::handler_t post_embeddings_oai;
server_http_context::handler_t post_rerank;
server_http_context::handler_t get_lora_adapters;
server_http_context::handler_t post_lora_adapters;
private:
// TODO: move these outside of server_routes?
std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot);
std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);
const common_params & params;
server_context_impl & ctx_server;
std::function<bool()> is_ready;
};