From c9cd49701a91cb10a07f07512d1bb5b966f05c6e Mon Sep 17 00:00:00 2001 From: leejet Date: Thu, 19 Feb 2026 20:54:42 +0800 Subject: [PATCH] fix: safely handle whitespace and consecutive newlines (#1288) --- src/tokenize_util.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/tokenize_util.cpp b/src/tokenize_util.cpp index bc0ff1d..22cf8ae 100644 --- a/src/tokenize_util.cpp +++ b/src/tokenize_util.cpp @@ -919,15 +919,21 @@ std::vector<std::string> token_split(const std::string& text) { // `\s*[\r\n]+|\s+(?!\S)|\s+` if (is_space(cp)) { - std::string token = codepoint_to_utf8(cp); - ++i; + std::string token; + bool saw_new_line = false; while (i < cps.size() && is_space(cps[i])) { token += codepoint_to_utf8(cps[i]); - ++i; + if (cps[i] == U'\r' || cps[i] == U'\n') { - break; + saw_new_line = true; + } else { + if (saw_new_line) { + break; + } } + + ++i; } tokens.push_back(token);