Skip to content

gguf-py: byteswapping improvements #12851

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 67 additions & 63 deletions gguf-py/gguf/scripts/gguf_convert_endian.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,61 @@
logger = logging.getLogger("gguf-convert-endian")


def byteswap_q4_0(tensor, block_offs):
# Each block_q4_0 consists of an f16 delta (scaling factor) followed by 16 int8 quantizations.

# Byte-Swap f16 sized delta field
delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
delta.byteswap(inplace=True)


def byteswap_q8_0(tensor, block_offs):
# Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations.

# Byte-Swap f16 sized delta field
delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
delta.byteswap(inplace=True)


def byteswap_q4_k(tensor, block_offs):
# Each block_q4_k consists of 2 f16 values followed by 140 int8 values.

# Byte-Swap f16 sized fields
delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
delta.byteswap(inplace=True)

delta = tensor.data[block_offs + 2:block_offs + 4].view(dtype=np.uint16)
delta.byteswap(inplace=True)


def byteswap_q6_k(tensor, block_offs):
# Each block_q6_k consists of 208 int8 values followed by 1 f16 value.

# Byte-Swap f16 sized field
delta = tensor.data[block_offs + 208:block_offs + 210].view(dtype=np.uint16)
delta.byteswap(inplace=True)


byteswap_tensors = {
gguf.GGMLQuantizationType.Q4_0: {
"block_size": 18, # 18 bytes = <f16 delta scaling factor> + 16 * <int8 quant>
"byteswap_func": byteswap_q4_0,
},
gguf.GGMLQuantizationType.Q8_0: {
"block_size": 34, # 34 bytes = <f16 delta scaling factor> + 32 * <int8 quant>
"byteswap_func": byteswap_q8_0,
},
gguf.GGMLQuantizationType.Q4_K: {
"block_size": 144, # 144 bytes = 2 * <f16 delta scaling factor> + 140 * <int8 quant>
"byteswap_func": byteswap_q4_k,
},
gguf.GGMLQuantizationType.Q6_K: {
"block_size": 210, # 210 bytes = <f16 delta scaling factor> + 208 * <int8 quant>
"byteswap_func": byteswap_q6_k,
},
}


def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None:
file_endian = reader.endianess.name
if reader.byte_order == 'S':
Expand All @@ -32,13 +87,11 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
sys.exit(0)
logger.info("* Checking tensors for conversion compatibility")
for tensor in reader.tensors:
if tensor.tensor_type not in (
gguf.GGMLQuantizationType.F32,
gguf.GGMLQuantizationType.F16,
gguf.GGMLQuantizationType.Q8_0,
gguf.GGMLQuantizationType.Q4_K,
gguf.GGMLQuantizationType.Q6_K,
):
if tensor.tensor_type not in byteswap_tensors and \
tensor.tensor_type not in (
gguf.GGMLQuantizationType.F32,
gguf.GGMLQuantizationType.F16,
):
raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}")
logger.info(f"* Preparing to convert from {file_endian} to {order}")
if args.dry_run:
Expand Down Expand Up @@ -72,78 +125,29 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
part.byteswap(inplace=True)

# Byte-swap tensor data if necessary
if tensor.tensor_type == gguf.GGMLQuantizationType.Q8_0:
# Handle Q8_0 tensor blocks (block_q8_0)
# Specific handling of block_q8_0 is required.
# Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations.

block_size = 34 # 34 bytes = <f16 delta scaling factor> + 32 * <int8 quant>

n_blocks = len(tensor.data) // block_size
for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
block_offs = block_num * block_size

# Byte-Swap f16 sized delta field
delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
delta.byteswap(inplace=True)

# Byte-Swap Q8 weights
if block_num % 100000 == 0:
inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")

elif tensor.tensor_type == gguf.GGMLQuantizationType.Q4_K:
# Handle Q4_K tensor blocks (block_q4_k)
# Specific handling of block_q4_k is required.
# Each block_q4_k consists of 2 f16 values followed by 140 int8 values.

if tensor.tensor_type in byteswap_tensors:
# first flatten structure
oldshape = tensor.data.shape
newshape = 1
for i in tensor.data.shape:
newshape *= i

tensor.data.resize(newshape)

block_size = 144
n_blocks = len(tensor.data) // block_size
for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
block_offs = block_num * block_size

# Byte-Swap f16 sized fields
delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
delta.byteswap(inplace=True)

delta = tensor.data[block_offs + 2:block_offs + 4].view(dtype=np.uint16)
delta.byteswap(inplace=True)

# Byte-Swap
if block_num % 100000 == 0:
inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")

elif tensor.tensor_type == gguf.GGMLQuantizationType.Q6_K:
# Handle Q6_K tensor blocks (block_q6_k)
# Specific handling of block_q6_k is required.
# Each block_q6_k consists of 208 int8 values followed by 1 f16 value.

# first flatten structure
newshape = 1
for i in tensor.data.shape:
newshape *= i

tensor.data.resize(newshape)
block_size = byteswap_tensors[tensor.tensor_type]["block_size"]
byteswap_func = byteswap_tensors[tensor.tensor_type]["byteswap_func"]

block_size = 210
n_blocks = len(tensor.data) // block_size
for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
block_offs = block_num * block_size

# Byte-Swap f16 sized field
delta = tensor.data[block_offs + 208:block_offs + 210].view(dtype=np.uint16)
delta.byteswap(inplace=True)
byteswap_func(tensor, block_offs)

# Byte-Swap
if block_num % 100000 == 0:
inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")

# restore old shape in case it's ever used
tensor.data.resize(oldshape)
else:
# Handle other tensor types
tensor.data.byteswap(inplace=True)
Expand Down