mirror of
https://github.com/ggerganov/llama.cpp
synced 2026-04-29 10:41:41 +02:00
convert : use F32 for dequant of pack-quantized tensors
This commit is contained in:
parent
3770d9410d
commit
128118fdbe
@ -364,7 +364,7 @@ class ModelBase:
|
||||
unpacked = unpacked.reshape(shape[0], (unpacked.shape[-1] + group_size - 1) // group_size, group_size)
|
||||
unpacked = unpacked - offset
|
||||
|
||||
return (unpacked * scale.unsqueeze(-1)).reshape(shape)
|
||||
return (unpacked * scale.unsqueeze(-1).float()).reshape(shape)
|
||||
|
||||
if quant_method == "bitnet":
|
||||
for name in self.model_tensors.keys():
|
||||
|
||||
Loading…
Reference in New Issue
Block a user