mirror of
https://github.com/ggerganov/ggml
synced 2026-03-03 05:29:44 +01:00
Add python example w/ cffi-generated bindings Features: - Seamless copies between tensors (ggml & numpy alike) with automatic (de/re)quantization - Access to full C API (incl. CUDA, MPI, OpenCL, Metal, alloc... and any local API changes) - Trivial regeneration with `python regenerate.py` (uses llama.cpp headers by default, README.md for options)
68 lines
1.9 KiB
Python
from ggml import ffi, lib
|
||
from ggml.utils import init, numpy, copy
|
||
import numpy as np
|
||
from math import pi, cos, sin, ceil
|
||
|
||
import matplotlib.pyplot as plt
|
||
|
||
# Arena for every ggml tensor created below; the wrapper frees it when
# the context object is garbage-collected.
ctx = init(mem_size=100*1024*1024) # Will be auto-GC'd

# Side length of the square test image.
n = 256

# Smooth 2D test pattern: orig[i, j] = cos(j*2*pi/n) * sin(i*2*pi/n),
# i.e. the outer product of a sine over rows and a cosine over columns.
# Vectorized with numpy instead of a nested Python comprehension.
angles = np.arange(n) * 2 * pi / n
orig = (np.sin(angles)[:, np.newaxis] * np.cos(angles)[np.newaxis, :]).astype(np.float32)

# F32 reference tensor holding the unquantized image.
orig_tensor = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_F32, n, n)
copy(orig, orig_tensor)  # fills the ggml tensor from the numpy array

# Every quantized ggml type, minus the two the (de)quantization helpers
# don't handle. `t` instead of `type` so the builtin isn't shadowed.
quants = [
    t for t in range(lib.GGML_TYPE_COUNT)
    if lib.ggml_is_quantized(t)
    and t not in (lib.GGML_TYPE_Q8_1, lib.GGML_TYPE_Q8_K) # Apparently not supported
]
# quants = [lib.GGML_TYPE_Q2_K] # Test a single one
def get_name(type):
    """Return the printable name of a ggml type, or '?' when the C API
    yields a NULL pointer."""
    c_str = lib.ggml_type_name(type)
    if not c_str:
        return '?'
    return ffi.string(c_str).decode('utf-8')
# Show the unquantized original first, then each quantized type in
# alphabetical order of its human-readable name.
quants = [None] + sorted(quants, key=get_name)
print(quants)

# Grid layout: 4 plots per row, as many rows as needed (ceil division).
ncols = 4
nrows = (len(quants) + ncols - 1) // ncols

plt.figure(figsize=(ncols * 5, nrows * 5), layout='tight')
# One subplot per entry: the original image first, then the round-trip
# result for each quantized type. `qtype` instead of `type` so the
# builtin isn't shadowed; `is None` for identity comparison (PEP 8).
for i, qtype in enumerate(quants):
    plt.subplot(nrows, ncols, i + 1)
    try:
        if qtype is None:
            # First cell: the unquantized reference image.
            plt.title('Original')
            plt.imshow(orig)
        else:
            # Round-trip the reference through the quantized type, then
            # read it back as a float array for display and error metrics.
            quantized_tensor = lib.ggml_new_tensor_2d(ctx, qtype, n, n)
            copy(orig_tensor, quantized_tensor)  # quantizes
            quantized = numpy(quantized_tensor, allow_copy=True)  # dequantizes

            # Error metrics vs. the original, plus the size ratio.
            d = quantized - orig
            results = {
                "l2": np.linalg.norm(d, 2),
                "linf": np.linalg.norm(d, np.inf),
                "compression":
                    round(lib.ggml_nbytes(orig_tensor) /
                          lib.ggml_nbytes(quantized_tensor), 1)
            }
            name = get_name(qtype)
            print(f'{name}: {results}')

            plt.title(f'{name} ({results["compression"]}x smaller)')
            plt.imshow(quantized, interpolation='nearest')
    except Exception as e:
        # Deliberately broad: a type that fails to quantize just leaves
        # its grid cell empty instead of aborting the whole figure.
        print(f'Error: {e}')

plt.show()