# Source code for hexrec.hexdump

# Copyright (c) 2013-2025, Andrea Zoppi
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

r"""Emulation of the hexdump utility."""

import io
import os
import sys
from typing import IO
from typing import Callable
from typing import List
from typing import Mapping
from typing import Sequence
from typing import Union  # NOTE: type | operator unsupported for Python < 3.10
from typing import cast as _cast

from bytesparse.base import ImmutableMemory

from .base import ByteString
from .utils import SparseMemoryIO

CHAR_PRINTABLE: Sequence[bytes] = [
    # Printable ASCII (0x20..0x7E) maps to itself, everything else to a dot.
    bytes((code,)) if 0x20 <= code <= 0x7E else b'.'
    for code in range(0x100)
] + [b' ', b'>', b'<']  # three extra entries at indices 256..258
r"""Printable characters lookup table.

Indexed by byte value; each entry is a single byte: the character itself for
printable ASCII, ``b'.'`` otherwise. Three extra entries follow the 256 byte
values.
"""

def _build_char_tokens() -> List[bytes]:
    r"""Builds the four-column character token of every byte value.

    Returns:
        list of bytes: 256 tokens indexed by byte value, followed by three
        extra marker tokens.
    """
    # Control characters shown with a C-style escape sequence.
    escapes = {
        0x00: b'\\0', 0x07: b'\\a', 0x08: b'\\b', 0x09: b'\\t',
        0x0A: b'\\n', 0x0B: b'\\v', 0x0C: b'\\f', 0x0D: b'\\r',
    }
    tokens = []

    for code in range(0x100):
        escape = escapes.get(code)
        if escape is not None:
            token = b'  ' + escape
        elif 0x20 <= code <= 0x7E:
            # Printable ASCII is shown as itself, right-aligned.
            token = b'   ' + bytes((code,))
        else:
            # Anything else falls back to three octal digits.
            token = b' %03o' % code
        tokens.append(token)

    tokens += [b' ---', b' >>>', b' <<<']
    return tokens


CHAR_TOKENS: Sequence[bytes] = _build_char_tokens()
r"""Character tokens lookup table.

Indexed by byte value; every token is four bytes wide.
"""

# Two-digit hexadecimal text of every byte value, indexed by the byte value,
# followed by three extra marker entries at indices 256..258.
_HEX_LOWER = [format(value, '02x').encode('ascii') for value in range(0x100)]
_HEX_LOWER.extend((b'--', b'>>', b'<<'))
_HEX_UPPER = [digits.upper() for digits in _HEX_LOWER]

# Same tables, with the leading separator space already attached.
_HEX_LOWER_TOKENS = [b' ' + digits for digits in _HEX_LOWER]
_HEX_UPPER_TOKENS = [b' ' + digits for digits in _HEX_UPPER]

# Three-digit octal text of every byte value, with leading separator space.
_OCTAL_TOKENS = [b' ' + format(value, '03o').encode('ascii') for value in range(0x100)]
_OCTAL_TOKENS.extend((b' ---', b' >>>', b' <<<'))

# Names of the selectable display options, in the order in which their output
# lines are emitted for each chunk when ``hexdump_core`` is called without an
# explicit ``format_order``.
# Each name is also a boolean keyword argument of ``hexdump_core`` and a key of
# ``_FORMAT_HANDLERS``. The 'default' format is not listed here: it is selected
# by ``hexdump_core`` itself when none of these options is enabled.
DEFAULT_FORMAT_ORDER: Sequence[str] = [
    'one_byte_octal',
    'one_byte_hex',
    'one_byte_char',
    'canonical',
    'two_bytes_decimal',
    'two_bytes_octal',
    'two_bytes_hex',
]
r"""Default order of display options."""


def _format_default(
    address: int,
    chunk: ByteString,
    width: int,
    upper: bool,
) -> List[bytes]:
    r"""Formats one line of the default display.

    The line is made of the address (7 hex digits) followed by the chunk
    bytes grouped into two-byte little-endian words of 4 hex digits each.
    An odd trailing byte is shown as a word with a zero high byte.

    Args:
        address (int): Address shown at the start of the line.
        chunk (bytes): Bytes of the line.
        width (int): Nominal number of bytes per line; shorter chunks are
            padded with blank columns.
        upper (bool): Uses upper case hex letters.

    Returns:
        list of bytes: Line tokens, without line separator.
    """
    if upper:
        hex_table = _HEX_UPPER
        tokens = [b'%07X' % address]
    else:
        hex_table = _HEX_LOWER
        tokens = [b'%07x' % address]

    size = len(chunk)
    paired = size - (size & 1)  # number of bytes belonging to full words

    for index in range(0, paired, 2):
        low = hex_table[chunk[index]]
        high = hex_table[chunk[index + 1]]
        tokens.append(b' ' + high + low)

    if paired != size:
        # Lone trailing byte: zero-extended to a full word.
        tokens.append(b' 00' + hex_table[chunk[size - 1]])

    if size < width:
        # One blank column per word missing from a short line.
        tokens += [b'     '] * ((width - size) // 2)

    return tokens


def _format_one_byte_octal(
    address: int,
    chunk: ByteString,
    width: int,
    upper: bool,
) -> List[bytes]:
    r"""Formats one line of the one-byte octal display.

    The line is made of the address (7 hex digits) followed by one
    three-digit octal token per byte.

    Args:
        address (int): Address shown at the start of the line.
        chunk (bytes): Bytes of the line.
        width (int): Nominal number of bytes per line; shorter chunks are
            padded with blank columns.
        upper (bool): Uses upper case hex letters on the address.

    Returns:
        list of bytes: Line tokens, without line separator.
    """
    tokens = [(b'%07X' if upper else b'%07x') % address]
    tokens += [_OCTAL_TOKENS[value] for value in chunk]

    missing = width - len(chunk)
    if missing > 0:
        # One blank column per byte missing from a short line.
        tokens += [b'    '] * missing

    return tokens


def _format_one_byte_hex(
    address: int,
    chunk: ByteString,
    width: int,
    upper: bool,
) -> List[bytes]:
    r"""Formats one line of the one-byte hexadecimal display.

    The line is made of the address (8 hex digits) followed by one
    two-digit hexadecimal token per byte.

    Args:
        address (int): Address shown at the start of the line.
        chunk (bytes): Bytes of the line.
        width (int): Nominal number of bytes per line; shorter chunks are
            padded with blank columns.
        upper (bool): Uses upper case hex letters on address and data.

    Returns:
        list of bytes: Line tokens, without line separator.
    """
    if upper:
        byte_tokens = _HEX_UPPER_TOKENS
        tokens = [b'%08X' % address]
    else:
        byte_tokens = _HEX_LOWER_TOKENS
        tokens = [b'%08x' % address]

    tokens += [byte_tokens[value] for value in chunk]

    missing = width - len(chunk)
    if missing > 0:
        # One blank column per byte missing from a short line.
        tokens += [b'   '] * missing

    return tokens


def _format_one_byte_char(
    address: int,
    chunk: ByteString,
    width: int,
    upper: bool,
) -> List[bytes]:
    r"""Formats one line of the one-byte character display.

    The line is made of the address (7 hex digits) followed by one
    character token per byte, taken from :data:`CHAR_TOKENS`.

    Args:
        address (int): Address shown at the start of the line.
        chunk (bytes): Bytes of the line.
        width (int): Nominal number of bytes per line; shorter chunks are
            padded with blank columns.
        upper (bool): Uses upper case hex letters on the address.

    Returns:
        list of bytes: Line tokens, without line separator.
    """
    tokens = [(b'%07X' if upper else b'%07x') % address]
    tokens += [CHAR_TOKENS[value] for value in chunk]

    missing = width - len(chunk)
    if missing > 0:
        # One blank column per byte missing from a short line.
        tokens += [b'    '] * missing

    return tokens


def _format_canonical(
    address: int,
    chunk: ByteString,
    width: int,
    upper: bool,
) -> List[bytes]:
    r"""Formats one line of the canonical hex+ASCII display.

    The line is made of the address (8 hex digits), one two-digit
    hexadecimal token per byte with an extra space before every group of
    eight columns, and finally the same bytes as printable characters
    enclosed in ``|`` characters.

    Args:
        address (int): Address shown at the start of the line.
        chunk (bytes): Bytes of the line.
        width (int): Nominal number of bytes per line; the hexadecimal area
            of shorter chunks is padded with blank columns.
        upper (bool): Uses upper case hex letters on address and data.

    Returns:
        list of bytes: Line tokens, without line separator.
    """
    address_fmt = b'%08X' if upper else b'%08x'
    tokens = [address_fmt % address]

    table = _HEX_UPPER_TOKENS if upper else _HEX_LOWER_TOKENS
    size = len(chunk)
    append = tokens.append

    for offset in range(size):
        if (offset & 7) == 0:
            append(b' ')  # extra separator before each group of 8 columns
        append(table[chunk[offset]])

    # Blank columns start right after the last data column. Starting from
    # ``size`` (instead of the leftover loop variable) keeps the ASCII area
    # aligned even for an empty chunk, where no data column was emitted.
    for offset in range(size, width):
        if (offset & 7) == 0:
            append(b' ')
        append(b'   ')

    table = CHAR_PRINTABLE
    append(b'  |')
    tokens.extend(table[b] for b in chunk)
    append(b'|')

    return tokens


def _format_two_bytes_decimal(
    address: int,
    chunk: ByteString,
    width: int,
    upper: bool,
) -> List[bytes]:
    r"""Formats one line of the two-byte decimal display.

    The line is made of the address (7 hex digits) followed by the chunk
    bytes grouped into two-byte little-endian words, each shown as five
    zero-filled decimal digits. An odd trailing byte is shown by itself.

    Args:
        address (int): Address shown at the start of the line.
        chunk (bytes): Bytes of the line.
        width (int): Nominal number of bytes per line; shorter chunks are
            padded with blank columns.
        upper (bool): Uses upper case hex letters on the address.

    Returns:
        list of bytes: Line tokens, without line separator.
    """
    tokens = [(b'%07X' if upper else b'%07x') % address]

    size = len(chunk)
    paired = size - (size & 1)  # number of bytes belonging to full words

    for index in range(0, paired, 2):
        word = (chunk[index + 1] << 8) | chunk[index]
        tokens.append(b'   %05d' % word)

    if paired != size:
        # Lone trailing byte: shown as a word on its own.
        tokens.append(b'   %05d' % chunk[size - 1])

    if size < width:
        # One blank column per word missing from a short line.
        tokens += [b'        '] * ((width - size) // 2)

    return tokens


def _format_two_bytes_octal(
    address: int,
    chunk: ByteString,
    width: int,
    upper: bool,
) -> List[bytes]:
    r"""Formats one line of the two-byte octal display.

    The line is made of the address (7 hex digits) followed by the chunk
    bytes grouped into two-byte little-endian words, each shown as six
    zero-filled octal digits. An odd trailing byte is shown by itself.

    Args:
        address (int): Address shown at the start of the line.
        chunk (bytes): Bytes of the line.
        width (int): Nominal number of bytes per line; shorter chunks are
            padded with blank columns.
        upper (bool): Uses upper case hex letters on the address.

    Returns:
        list of bytes: Line tokens, without line separator.
    """
    tokens = [(b'%07X' if upper else b'%07x') % address]

    size = len(chunk)
    paired = size - (size & 1)  # number of bytes belonging to full words

    for index in range(0, paired, 2):
        word = (chunk[index + 1] << 8) | chunk[index]
        tokens.append(b'  %06o' % word)

    if paired != size:
        # Lone trailing byte: shown as a word on its own.
        tokens.append(b'  %06o' % chunk[size - 1])

    if size < width:
        # One blank column per word missing from a short line.
        tokens += [b'        '] * ((width - size) // 2)

    return tokens


def _format_two_bytes_hex(
    address: int,
    chunk: ByteString,
    width: int,
    upper: bool,
) -> List[bytes]:
    r"""Formats one line of the two-byte hexadecimal display.

    The line is made of the address (7 hex digits) followed by the chunk
    bytes grouped into two-byte little-endian words of 4 hex digits each.
    An odd trailing byte is shown as a word with a zero high byte.

    Args:
        address (int): Address shown at the start of the line.
        chunk (bytes): Bytes of the line.
        width (int): Nominal number of bytes per line; shorter chunks are
            padded with blank columns.
        upper (bool): Uses upper case hex letters on address and data.

    Returns:
        list of bytes: Line tokens, without line separator.
    """
    if upper:
        hex_table = _HEX_UPPER
        tokens = [b'%07X' % address]
    else:
        hex_table = _HEX_LOWER
        tokens = [b'%07x' % address]

    size = len(chunk)
    paired = size - (size & 1)  # number of bytes belonging to full words

    for index in range(0, paired, 2):
        low = hex_table[chunk[index]]
        high = hex_table[chunk[index + 1]]
        tokens.append(b'    ' + high + low)

    if paired != size:
        # Lone trailing byte: zero-extended to a full word.
        tokens.append(b'    00' + hex_table[chunk[size - 1]])

    if size < width:
        # One blank column per word missing from a short line.
        tokens += [b'        '] * ((width - size) // 2)

    return tokens


# Signature shared by all the line formatters:
# (address, chunk, width, upper) -> line tokens.
_FormatHandler = Callable[[int, ByteString, int, bool], List[bytes]]

# Line formatter of each display option, by option name.
_FORMAT_HANDLERS: Mapping[str, _FormatHandler] = dict(
    default=_format_default,
    one_byte_octal=_format_one_byte_octal,
    one_byte_hex=_format_one_byte_hex,
    one_byte_char=_format_one_byte_char,
    canonical=_format_canonical,
    two_bytes_decimal=_format_two_bytes_decimal,
    two_bytes_octal=_format_two_bytes_octal,
    two_bytes_hex=_format_two_bytes_hex,
)

# Lower case address format of each display option, by option name.
# It mirrors the address width used by the matching line formatter.
_ADDRESS_FMT: Mapping[str, bytes] = dict(
    default=b'%07x',
    one_byte_octal=b'%07x',
    one_byte_hex=b'%08x',
    one_byte_char=b'%07x',
    canonical=b'%08x',
    two_bytes_decimal=b'%07x',
    two_bytes_octal=b'%07x',
    two_bytes_hex=b'%07x',
)


# noinspection PyShadowingBuiltins
def hexdump_core(
    infile: Union[str, ByteString, IO, None] = None,
    outfile: Union[str, ByteString, IO, None] = None,
    one_byte_octal: bool = False,
    one_byte_hex: bool = False,
    one_byte_char: bool = False,
    canonical: bool = False,
    two_bytes_decimal: bool = False,
    two_bytes_octal: bool = False,
    two_bytes_hex: bool = False,
    color: Union[str, None] = None,
    format: Union[str, None] = None,
    format_file: Union[str, None] = None,
    length: Union[int, None] = None,
    skip: Union[int, None] = None,
    no_squeezing: bool = False,
    upper: bool = False,
    width: int = 16,
    linesep: Union[ByteString, None] = None,
    format_order: Union[Sequence[str], None] = None,
) -> IO:
    r"""Emulation of the `hexdump` utility core.

    Args:
        infile (str or bytes or memory or stream):
            Input data.
            If :obj:`str`, it is considered as the input file path.
            If :obj:`bytes`, :obj:`bytearray`, or :obj:`memoryview`, it is
            the input byte chunk.
            If :obj:`ImmutableMemory`, it is read as a sparse memory; when
            `skip` is ``None``, the dump starts from the memory start
            address, aligned down to a multiple of `width`.
            If ``None``, it reads from the standard input.
            Anything else is used as an already open binary input stream.

        outfile (str or stream):
            Output data.
            If :obj:`str`, it is considered as the output file path.
            If ``None``, it writes to the standard output.
            Anything else is used as an already open binary output stream,
            i.e. it must provide a ``write()`` method.

        one_byte_octal (bool):
            One-byte octal display. Display the input offset in
            hexadecimal, followed by sixteen space-separated,
            three-column, zero-filled bytes of input data, in octal, per
            line.

        one_byte_hex (bool):
            One-byte hexadecimal display. Display the input offset in
            hexadecimal, followed by sixteen space-separated, two-column,
            zero-filled bytes of input data, in hexadecimal, per line.

        one_byte_char (bool):
            One-byte character display. Display the input offset in
            hexadecimal, followed by sixteen space-separated,
            three-column, space-filled characters of input data per line.

        canonical (bool):
            Canonical hex+ASCII display. Display the input offset in
            hexadecimal, followed by sixteen space-separated, two-column,
            hexadecimal bytes, followed by the same sixteen bytes in %_p
            format enclosed in | characters. Invoking the program as hd
            implies this option.

        two_bytes_decimal (bool):
            Two-byte decimal display. Display the input offset in
            hexadecimal, followed by eight space-separated, five-column,
            zero-filled, two-byte units of input data, in unsigned
            decimal, per line.

        two_bytes_octal (bool):
            Two-byte octal display. Display the input offset in
            hexadecimal, followed by eight space-separated, six-column,
            zero-filled, two-byte quantities of input data, in octal, per
            line.

        two_bytes_hex (bool):
            Two-byte hexadecimal display. Display the input offset in
            hexadecimal, followed by eight space-separated, four-column,
            zero-filled, two-byte quantities of input data, in
            hexadecimal, per line.

        color (str):
            *CURRENTLY NOT SUPPORTED*. Please provide ``None``.

        format (str):
            *CURRENTLY NOT SUPPORTED*. Please provide ``None``.

        format_file (str):
            *CURRENTLY NOT SUPPORTED*. Please provide ``None``.

        length (int):
            Interpret only length bytes of input.

        skip (int):
            Skip offset bytes from the beginning of the input.
            If the input stream cannot seek, the bytes are read and
            discarded instead.

        no_squeezing (bool):
            The -v option causes hexdump to display all input data.
            Without the -v option, any number of groups of output lines
            which would be identical to the immediately preceding group
            of output lines (except for the input offsets), are replaced
            with a line comprised of a single asterisk.

        upper (bool):
            Uses upper case hex letters on address and data.

        width (int):
            Number of bytes per line.
            It must be at least 1, or at least 2 if any two-byte display
            option is enabled.

        linesep (bytes):
            Line separator bytes.
            If ``None``, the encoded :obj:`os.linesep` is used.

        format_order (list of str):
            If not ``None``, it indicates the order of display options
            (``one_byte_octal``, ``one_byte_hex``, ``one_byte_char``,
            ``canonical``, ``two_bytes_decimal``, ``two_bytes_octal``,
            ``two_bytes_hex``).
            Duplicates are allowed.
            Only those with the corresponding boolean argument true are used.
            If none of them is enabled, the default display is used.

    Returns:
        stream: The handle to the output stream.
        If `outfile` is a file path, the stream is already closed.

    Raises:
        NotImplementedError: `color`, `format`, or `format_file` is not
            ``None``.
        ValueError: Negative `skip` or `length`, invalid `width`, or
            unknown name within `format_order`.
    """

    if color is not None:
        raise NotImplementedError('"color" option is not supported')
    if format is not None:
        raise NotImplementedError('"format" option is not supported')
    if format_file is not None:
        raise NotImplementedError('"format_file" option is not supported')

    if skip is not None:
        skip = skip.__index__()
        if skip < 0:
            raise ValueError('negative skip')

    if length is not None:
        length = length.__index__()
        if length < 0:
            raise ValueError('negative length')

    width = width.__index__()
    width_min = 2 if two_bytes_decimal or two_bytes_octal or two_bytes_hex else 1
    if width < width_min:
        raise ValueError('invalid width')

    if linesep is None:
        linesep = os.linesep.encode()

    format_flags = {
        'one_byte_octal': one_byte_octal,
        'one_byte_hex': one_byte_hex,
        'one_byte_char': one_byte_char,
        'canonical': canonical,
        'two_bytes_decimal': two_bytes_decimal,
        'two_bytes_octal': two_bytes_octal,
        'two_bytes_hex': two_bytes_hex,
    }
    if format_order is None:
        format_order = DEFAULT_FORMAT_ORDER
    else:
        for format_name in format_order:
            if format_name not in format_flags:
                raise ValueError(f'unknown format option: {format_name!r}')

    # Only the enabled display options take part in the output; keeping their
    # names allows the final offset line to match the last format really used,
    # instead of the last name listed within the (possibly disabled) order.
    active_formats = [format_name
                      for format_name in format_order
                      if format_flags[format_name]]
    if not active_formats:
        active_formats = ['default']
    format_handlers = [_FORMAT_HANDLERS[format_name]
                       for format_name in active_formats]

    do_squeezing = not no_squeezing
    instream: Union[IO, SparseMemoryIO, None] = None
    outstream: Union[IO, None] = None
    try:
        # Input stream binding
        if infile is None:
            instream = sys.stdin.buffer
        elif isinstance(infile, str):
            instream = open(infile, 'rb')
        elif isinstance(infile, (bytes, bytearray, memoryview)):
            instream = io.BytesIO(infile)
        elif isinstance(infile, ImmutableMemory):
            instream = SparseMemoryIO(memory=infile)
            if skip is None:
                # Start from the line containing the first memory address
                skip = infile.start
                skip -= skip % width
        else:
            instream = _cast(IO, infile)
        assert instream is not None

        # Output stream binding
        if outfile is None:
            outstream = sys.stdout.buffer
        elif isinstance(outfile, str):
            outstream = open(outfile, 'wb')
        else:
            outstream = _cast(IO, outfile)
        assert outstream is not None

        if skip:
            try:
                instream.seek(skip, io.SEEK_CUR)
            except OSError:
                # Non-seekable input (e.g. piped standard input):
                # consume the skipped bytes instead.
                remaining = skip
                while remaining > 0:
                    discarded = instream.read(min(remaining, io.DEFAULT_BUFFER_SIZE))
                    if not discarded:
                        break
                    remaining -= len(discarded)
        else:
            skip = 0

        offset = 0
        last_chunk = None
        squeezing = False
        read = instream.read
        write = outstream.write

        while True:
            if length is None:
                chunk = read(width)
            else:
                # Never read beyond the requested length
                chunk = read(min(width, length - offset))
            chunk = _cast(ByteString, chunk)

            if not chunk:
                break

            if do_squeezing and chunk == last_chunk:
                # A run of identical lines is collapsed into a single asterisk
                if not squeezing:
                    write(b'*')
                    write(linesep)
                    squeezing = True
            else:
                squeezing = False
                address = skip + offset

                for format_handler in format_handlers:
                    tokens = format_handler(address, chunk, width, upper)
                    tokens.append(linesep)
                    line = b''.join(tokens)
                    write(line)

            last_chunk = chunk
            offset += len(chunk)

        # Final line: end offset, formatted like the last format in use
        address_fmt = _ADDRESS_FMT[active_formats[-1]]
        if upper:
            address_fmt = address_fmt.upper()
        write(address_fmt % (skip + offset))
        write(linesep)

    finally:
        # Only the streams opened here (from file paths) are closed here
        if instream is not None and isinstance(infile, str):
            instream.close()

        if outstream is not None and isinstance(outfile, str):
            outstream.close()

    assert outstream is not None
    return outstream