llama.cpp/common/jinja/string.cpp
Xuan-Son Nguyen c15395f73c
common : implement new jinja template engine (#18462)
* jinja vm

* lexer

* add vm types

* demo

* clean up

* parser ok

* binary_expression::execute

* shadow naming

* bin ops works!

* fix map object

* add string builtins

* add more builtins

* wip

* use mk_val

* eval with is_user_input

* render gemma tmpl ok

* track input string even after transformations

* support binded functions

* keyword arguments and slicing array

* use shared_ptr for values

* add mk_stmt

* allow print source on exception

* fix negate test

* testing more templates

* mostly works

* add filter_statement

* allow func to access ctx

* add jinja-value.cpp

* impl global_from_json

* a lot of fixes

* more tests

* more fix, more tests

* more fixes

* rm workarounds

* demo: type inferrence

* add placeholder for tojson

* improve function args handling

* rm type inference

* no more std::regex

* trailing spaces

* make testing more flexible

* make output a bit cleaner

* (wip) redirect minja calls

* test: add --output

* fix crash on macro kwargs

* add minimal caps system

* add some workarounds

* rm caps_apply_workarounds

* get rid of preprocessing

* more fixes

* fix test-chat-template

* move test-chat-jinja into test-chat-template

* rm test-chat-jinja from cmake

* test-chat-template: use common

* fix build

* fix build (2)

* rename vm --> interpreter

* improve error reporting

* correct lstrip behavior

* add tojson

* more fixes

* disable tests for COMMON_CHAT_FORMAT_GENERIC

* make sure tojson output correct order

* add object.length

* fully functional selectattr / rejectattr

* improve error reporting

* more builtins added, more fixes

* create jinja rendering tests

* fix testing.h path

* adjust whitespace rules

* more fixes

* temporary disable test for ibm-granite

* r/lstrip behavior matched with hf.js

* minimax, glm4.5 ok

* add append and pop

* kimi-k2 ok

* test-chat passed

* fix lstrip_block

* add more jinja tests

* cast to unsigned char

* allow dict key to be numeric

* nemotron: rm windows newline

* tests ok

* fix test

* rename interpreter --> runtime

* fix build

* add more checks

* bring back generic format support

* fix Apertus

* [json.exception.out_of_range.403] key 'content' not found

* rm generic test

* refactor input marking

* add docs

* fix windows build

* clarify error message

* improved tests

* split/rsplit with maxsplit

* non-inverse maxsplit

forgot to change after simplifying

* implement separators for tojson and fix indent

* i like to move it move it

* rename null -- > none

* token::eof

* some nits + comments

* add exception classes for lexer and parser

* null -> none

* rename global -> env

* rm minja

* update docs

* docs: add input marking caveats

* imlement missing jinja-tests functions

* oops

* support trim filter with args, remove bogus to_json reference

* numerous argument fixes

* updated tests

* implement optional strip chars parameter

* use new chars parameter

* float filter also has default

* always leave at least one decimal in float string

* jinja : static analysis + header cleanup + minor fixes

* add fuzz test

* add string.cpp

* fix chat_template_kwargs

* nits

* fix build

* revert

* unrevert

sorry :)

* add fuzz func_args, refactor to be safer

* fix array.map()

* loosen ensure_vals max count condition, add not impl for map(int)

* hopefully fix windows

* check if empty first

* normalize newlines

---------

Co-authored-by: Alde Rojas <hello@alde.dev>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-01-16 11:22:06 +01:00

208 lines
5.1 KiB
C++

#include "jinja/string.h"
#include "jinja/value.h"
#include <algorithm>
#include <functional>
#include <optional>
#include <sstream>
#include <string>
#include <vector>
namespace jinja {
//
// string_part
//
bool string_part::is_uppercase() const {
for (char c : val) {
if (std::islower(static_cast<unsigned char>(c))) {
return false;
}
}
return true;
}
bool string_part::is_lowercase() const {
for (char c : val) {
if (std::isupper(static_cast<unsigned char>(c))) {
return false;
}
}
return true;
}
//
// string
//
void string::mark_input() {
for (auto & part : parts) {
part.is_input = true;
}
}
std::string string::str() const {
if (parts.size() == 1) {
return parts[0].val;
}
std::ostringstream oss;
for (const auto & part : parts) {
oss << part.val;
}
return oss.str();
}
size_t string::length() const {
size_t len = 0;
for (const auto & part : parts) {
len += part.val.length();
}
return len;
}
bool string::all_parts_are_input() const {
for (const auto & part : parts) {
if (!part.is_input) {
return false;
}
}
return true;
}
bool string::is_uppercase() const {
for (const auto & part : parts) {
if (!part.is_uppercase()) {
return false;
}
}
return true;
}
bool string::is_lowercase() const {
for (const auto & part : parts) {
if (!part.is_lowercase()) {
return false;
}
}
return true;
}
// mark this string as input if other has ALL parts as input
void string::mark_input_based_on(const string & other) {
if (other.all_parts_are_input()) {
for (auto & part : parts) {
part.is_input = true;
}
}
}
string string::append(const string & other) {
for (const auto & part : other.parts) {
parts.push_back(part);
}
return *this;
}
// in-place transformation
using transform_fn = std::function<std::string(const std::string&)>;
static string apply_transform(string & self, const transform_fn & fn) {
for (auto & part : self.parts) {
part.val = fn(part.val);
}
return self;
}
string string::uppercase() {
return apply_transform(*this, [](const std::string & s) {
std::string res = s;
std::transform(res.begin(), res.end(), res.begin(), ::toupper);
return res;
});
}
string string::lowercase() {
return apply_transform(*this, [](const std::string & s) {
std::string res = s;
std::transform(res.begin(), res.end(), res.begin(), ::tolower);
return res;
});
}
string string::capitalize() {
return apply_transform(*this, [](const std::string & s) {
if (s.empty()) return s;
std::string res = s;
res[0] = ::toupper(static_cast<unsigned char>(res[0]));
std::transform(res.begin() + 1, res.end(), res.begin() + 1, ::tolower);
return res;
});
}
string string::titlecase() {
return apply_transform(*this, [](const std::string & s) {
std::string res = s;
bool capitalize_next = true;
for (char &c : res) {
if (isspace(static_cast<unsigned char>(c))) {
capitalize_next = true;
} else if (capitalize_next) {
c = ::toupper(static_cast<unsigned char>(c));
capitalize_next = false;
} else {
c = ::tolower(static_cast<unsigned char>(c));
}
}
return res;
});
}
string string::strip(bool left, bool right, std::optional<const std::string_view> chars) {
static auto strip_part = [](const std::string & s, bool left, bool right, std::optional<const std::string_view> chars) -> std::string {
size_t start = 0;
size_t end = s.length();
auto match_char = [&chars](unsigned char c) -> bool {
return chars ? (*chars).find(c) != std::string::npos : isspace(c);
};
if (left) {
while (start < end && match_char(static_cast<unsigned char>(s[start]))) {
++start;
}
}
if (right) {
while (end > start && match_char(static_cast<unsigned char>(s[end - 1]))) {
--end;
}
}
return s.substr(start, end - start);
};
if (parts.empty()) {
return *this;
}
if (left) {
for (size_t i = 0; i < parts.size(); ++i) {
parts[i].val = strip_part(parts[i].val, true, false, chars);
if (parts[i].val.empty()) {
// remove empty part
parts.erase(parts.begin() + i);
--i;
continue;
} else {
break;
}
}
}
if (right) {
for (size_t i = parts.size(); i-- > 0;) {
parts[i].val = strip_part(parts[i].val, false, true, chars);
if (parts[i].val.empty()) {
// remove empty part
parts.erase(parts.begin() + i);
continue;
} else {
break;
}
}
}
return *this;
}
} // namespace jinja