mirror of
https://github.com/ggerganov/llama.cpp
synced 2026-03-06 15:19:40 +01:00
* implement sleeping at queue level * implement server-context suspend * add test * add docs * optimization: add fast path * make sure to free llama_init * nits * fix use-after-free * allow /models to be accessed during sleeping, fix use-after-free * don't allow accessing /models during sleep, it is not thread-safe * fix data race on accessing props and model_meta * small clean up * trailing whitespace * rm outdated comments
40 lines
1000 B
Python
40 lines
1000 B
Python
import pytest
|
|
import time
|
|
from utils import *
|
|
|
|
server = ServerPreset.tinyllama2()
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def create_server():
    """Reset the module-level server to a fresh tinyllama2 preset before every test.

    autouse=True makes this run automatically for each test in the module, so
    no test observes state (e.g. sleep settings) left over from a previous one.
    """
    global server
    server = ServerPreset.tinyllama2()
|
|
|
|
|
|
def test_server_sleep():
    """Verify the server sleeps after being idle and wakes up on a generation request.

    Flow: start the server with a 1-second idle threshold, wait past it, check
    that /health and /props still respond while sleeping (and /props reports
    is_sleeping), then send a /completion request and confirm the server
    reports it is no longer sleeping.
    """
    global server
    server.sleep_idle_seconds = 1
    server.start()

    # wait a bit so that server can go to sleep
    time.sleep(2)

    # make sure these endpoints are still responsive after sleep
    res = server.make_request("GET", "/health")
    assert res.status_code == 200
    res = server.make_request("GET", "/props")
    assert res.status_code == 200
    # `is True` (not `== True`): idiomatic singleton comparison, PEP 8 / E712
    assert res.body["is_sleeping"] is True

    # make a generation request to wake up the server
    res = server.make_request("POST", "/completion", data={
        "n_predict": 1,
        "prompt": "Hello",
    })
    assert res.status_code == 200

    # it should no longer be sleeping
    res = server.make_request("GET", "/props")
    assert res.status_code == 200
    assert res.body["is_sleeping"] is False