mirror of
https://github.com/ggerganov/llama.cpp
synced 2026-03-11 17:51:30 +01:00
* jinja vm * lexer * add vm types * demo * clean up * parser ok * binary_expression::execute * shadow naming * bin ops works! * fix map object * add string builtins * add more builtins * wip * use mk_val * eval with is_user_input * render gemma tmpl ok * track input string even after transformations * support binded functions * keyword arguments and slicing array * use shared_ptr for values * add mk_stmt * allow print source on exception * fix negate test * testing more templates * mostly works * add filter_statement * allow func to access ctx * add jinja-value.cpp * impl global_from_json * a lot of fixes * more tests * more fix, more tests * more fixes * rm workarounds * demo: type inferrence * add placeholder for tojson * improve function args handling * rm type inference * no more std::regex * trailing spaces * make testing more flexible * make output a bit cleaner * (wip) redirect minja calls * test: add --output * fix crash on macro kwargs * add minimal caps system * add some workarounds * rm caps_apply_workarounds * get rid of preprocessing * more fixes * fix test-chat-template * move test-chat-jinja into test-chat-template * rm test-chat-jinja from cmake * test-chat-template: use common * fix build * fix build (2) * rename vm --> interpreter * improve error reporting * correct lstrip behavior * add tojson * more fixes * disable tests for COMMON_CHAT_FORMAT_GENERIC * make sure tojson output correct order * add object.length * fully functional selectattr / rejectattr * improve error reporting * more builtins added, more fixes * create jinja rendering tests * fix testing.h path * adjust whitespace rules * more fixes * temporary disable test for ibm-granite * r/lstrip behavior matched with hf.js * minimax, glm4.5 ok * add append and pop * kimi-k2 ok * test-chat passed * fix lstrip_block * add more jinja tests * cast to unsigned char * allow dict key to be numeric * nemotron: rm windows newline * tests ok * fix test * rename interpreter --> runtime * fix build * 
add more checks * bring back generic format support * fix Apertus * [json.exception.out_of_range.403] key 'content' not found * rm generic test * refactor input marking * add docs * fix windows build * clarify error message * improved tests * split/rsplit with maxsplit * non-inverse maxsplit forgot to change after simplifying * implement separators for tojson and fix indent * i like to move it move it * rename null -- > none * token::eof * some nits + comments * add exception classes for lexer and parser * null -> none * rename global -> env * rm minja * update docs * docs: add input marking caveats * imlement missing jinja-tests functions * oops * support trim filter with args, remove bogus to_json reference * numerous argument fixes * updated tests * implement optional strip chars parameter * use new chars parameter * float filter also has default * always leave at least one decimal in float string * jinja : static analysis + header cleanup + minor fixes * add fuzz test * add string.cpp * fix chat_template_kwargs * nits * fix build * revert * unrevert sorry :) * add fuzz func_args, refactor to be safer * fix array.map() * loosen ensure_vals max count condition, add not impl for map(int) * hopefully fix windows * check if empty first * normalize newlines --------- Co-authored-by: Alde Rojas <hello@alde.dev> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
158 lines
5.2 KiB
C++
158 lines
5.2 KiB
C++
#pragma once
|
|
|
|
#include "utils.h"
|
|
|
|
#include <cctype>
|
|
#include <map>
|
|
#include <stdexcept>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
namespace jinja {
|
|
|
|
// A single lexical unit produced by the lexer.
// `pos` records where the token starts in the source string, for error reporting.
struct token {
    enum type {
        eof,  // end of source
        text, // The text between Jinja statements or expressions

        numeric_literal, // e.g., 123, 1.0
        string_literal,  // 'string'
        identifier,      // Variables, functions, statements, booleans, etc.
        equals,          // =
        open_paren,      // (
        close_paren,     // )
        open_statement,  // {%
        close_statement, // %}
        open_expression, // {{
        close_expression, // }}
        open_square_bracket,  // [
        close_square_bracket, // ]
        open_curly_bracket,   // {
        close_curly_bracket,  // }
        comma, // ,
        dot,   // .
        colon, // :
        pipe,  // |

        call_operator,                  // ()
        additive_binary_operator,       // + - ~
        multiplicative_binary_operator, // * / %
        comparison_binary_operator,     // < > <= >= == !=
        unary_operator,                 // ! - +
        comment,                        // {# ... #}
    };

    type t;            // kind of this token
    std::string value; // the token's text content (e.g. literal contents, identifier name — presumably; confirm in lexer.cpp)
    size_t pos;        // offset of the token within the tokenized source string
};
|
|
|
|
// Returns a human-readable name for a token type (for diagnostics and error
// messages); any value outside the enum yields "unknown".
static std::string type_to_string(token::type t) {
    const char * name = "unknown";
    switch (t) {
        case token::eof:                            name = "eof";                            break;
        case token::text:                           name = "text";                           break;
        case token::numeric_literal:                name = "numeric_literal";                break;
        case token::string_literal:                 name = "string_literal";                 break;
        case token::identifier:                     name = "identifier";                     break;
        case token::equals:                         name = "equals";                         break;
        case token::open_paren:                     name = "open_paren";                     break;
        case token::close_paren:                    name = "close_paren";                    break;
        case token::open_statement:                 name = "open_statement";                 break;
        case token::close_statement:                name = "close_statement";                break;
        case token::open_expression:                name = "open_expression";                break;
        case token::close_expression:               name = "close_expression";               break;
        case token::open_square_bracket:            name = "open_square_bracket";            break;
        case token::close_square_bracket:           name = "close_square_bracket";           break;
        case token::open_curly_bracket:             name = "open_curly_bracket";             break;
        case token::close_curly_bracket:            name = "close_curly_bracket";            break;
        case token::comma:                          name = "comma";                          break;
        case token::dot:                            name = "dot";                            break;
        case token::colon:                          name = "colon";                          break;
        case token::pipe:                           name = "pipe";                           break;
        case token::call_operator:                  name = "call_operator";                  break;
        case token::additive_binary_operator:       name = "additive_binary_operator";       break;
        case token::multiplicative_binary_operator: name = "multiplicative_binary_operator"; break;
        case token::comparison_binary_operator:     name = "comparison_binary_operator";     break;
        case token::unary_operator:                 name = "unary_operator";                 break;
        default:                                    name = (t == token::comment) ? "comment" : "unknown"; break;
    }
    return name;
}
|
|
|
|
// Result of lexer::tokenize: the token stream plus the source string that the
// tokens' `pos` offsets refer to.
struct lexer_result {
    std::vector<token> tokens; // tokens in source order (presumably terminated by token::eof — confirm in lexer.cpp)
    std::string source;        // the source text the tokens were produced from
};
|
|
|
|
struct lexer {
|
|
const std::map<char, char> escape_chars = {
|
|
{'n', '\n'},
|
|
{'t', '\t'},
|
|
{'r', '\r'},
|
|
{'b', '\b'},
|
|
{'f', '\f'},
|
|
{'v', '\v'},
|
|
{'\\', '\\'},
|
|
{'\'', '\''},
|
|
{'\"', '\"'},
|
|
};
|
|
|
|
static bool is_word(char c) {
|
|
return std::isalnum(static_cast<unsigned char>(c)) || c == '_';
|
|
}
|
|
|
|
static bool is_integer(char c) {
|
|
return std::isdigit(static_cast<unsigned char>(c));
|
|
}
|
|
|
|
const std::vector<std::pair<std::string, token::type>> ordered_mapping_table = {
|
|
// Trimmed control sequences
|
|
{"{%-", token::open_statement},
|
|
{"-%}", token::close_statement},
|
|
{"{{-", token::open_expression},
|
|
{"-}}", token::close_expression},
|
|
// Control sequences
|
|
{"{%", token::open_statement},
|
|
{"%}", token::close_statement},
|
|
{"{{", token::open_expression},
|
|
{"}}", token::close_expression},
|
|
// Single character tokens
|
|
{"(", token::open_paren},
|
|
{")", token::close_paren},
|
|
{"{", token::open_curly_bracket},
|
|
{"}", token::close_curly_bracket},
|
|
{"[", token::open_square_bracket},
|
|
{"]", token::close_square_bracket},
|
|
{",", token::comma},
|
|
{".", token::dot},
|
|
{":", token::colon},
|
|
{"|", token::pipe},
|
|
// Comparison operators
|
|
{"<=", token::comparison_binary_operator},
|
|
{">=", token::comparison_binary_operator},
|
|
{"==", token::comparison_binary_operator},
|
|
{"!=", token::comparison_binary_operator},
|
|
{"<", token::comparison_binary_operator},
|
|
{">", token::comparison_binary_operator},
|
|
// Arithmetic operators
|
|
{"+", token::additive_binary_operator},
|
|
{"-", token::additive_binary_operator},
|
|
{"~", token::additive_binary_operator},
|
|
{"*", token::multiplicative_binary_operator},
|
|
{"/", token::multiplicative_binary_operator},
|
|
{"%", token::multiplicative_binary_operator},
|
|
// Assignment operator
|
|
{"=", token::equals},
|
|
};
|
|
|
|
// tokenize the source string into a list of tokens
|
|
// may throw lexer_exception on error
|
|
lexer_result tokenize(const std::string & source);
|
|
};
|
|
|
|
// Error thrown during tokenization (see lexer::tokenize). The what() message
// is built by fmt_error_with_source (from utils.h), which presumably embeds
// the offending position within the source text — confirm in utils.h.
struct lexer_exception : public std::runtime_error {
    // msg: description of the lexical error; source: the full template text;
    // pos: offset into `source` where the error was detected.
    lexer_exception(const std::string & msg, const std::string & source, size_t pos)
        : std::runtime_error(fmt_error_with_source("lexer", msg, source, pos)) {}
};
|
|
|
|
} // namespace jinja
|