# Copyright (c) 2013-2024, Andrea Zoppi
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
r"""Emulation of the hexdump utility."""
import io
import os
import sys
from typing import IO
from typing import Callable
from typing import List
from typing import Mapping
from typing import Optional
from typing import Sequence
from typing import Union
from bytesparse.base import ImmutableMemory
from .base import AnyBytes
from .utils import SparseMemoryIO
CHAR_PRINTABLE: Sequence[bytes] = [b.to_bytes(1, 'big') for b in (
b'................'
b'................'
b' !"#$%&\'()*+,-./'
b'0123456789:;<=>?'
b'@ABCDEFGHIJKLMNO'
b'PQRSTUVWXYZ[\\]^_'
b'`abcdefghijklmno'
b'pqrstuvwxyz{|}~.'
b'................'
b'................'
b'................'
b'................'
b'................'
b'................'
b'................'
b'................'
b' ><'
)]
r"""Printable characters lookup table."""
CHAR_TOKENS: Sequence[bytes] = [
b' \\0', b' 001', b' 002', b' 003', b' 004', b' 005', b' 006', b' \\a',
b' \\b', b' \\t', b' \\n', b' \\v', b' \\f', b' \\r', b' 016', b' 017',
b' 020', b' 021', b' 022', b' 023', b' 024', b' 025', b' 026', b' 027',
b' 030', b' 031', b' 032', b' 033', b' 034', b' 035', b' 036', b' 037',
b' ', b' !', b' "', b' #', b' $', b' %', b' &', b" '",
b' (', b' )', b' *', b' +', b' ,', b' -', b' .', b' /',
b' 0', b' 1', b' 2', b' 3', b' 4', b' 5', b' 6', b' 7',
b' 8', b' 9', b' :', b' ;', b' <', b' =', b' >', b' ?',
b' @', b' A', b' B', b' C', b' D', b' E', b' F', b' G',
b' H', b' I', b' J', b' K', b' L', b' M', b' N', b' O',
b' P', b' Q', b' R', b' S', b' T', b' U', b' V', b' W',
b' X', b' Y', b' Z', b' [', b' \\', b' ]', b' ^', b' _',
b' `', b' a', b' b', b' c', b' d', b' e', b' f', b' g',
b' h', b' i', b' j', b' k', b' l', b' m', b' n', b' o',
b' p', b' q', b' r', b' s', b' t', b' u', b' v', b' w',
b' x', b' y', b' z', b' {', b' |', b' }', b' ~', b' 177',
b' 200', b' 201', b' 202', b' 203', b' 204', b' 205', b' 206', b' 207',
b' 210', b' 211', b' 212', b' 213', b' 214', b' 215', b' 216', b' 217',
b' 220', b' 221', b' 222', b' 223', b' 224', b' 225', b' 226', b' 227',
b' 230', b' 231', b' 232', b' 233', b' 234', b' 235', b' 236', b' 237',
b' 240', b' 241', b' 242', b' 243', b' 244', b' 245', b' 246', b' 247',
b' 250', b' 251', b' 252', b' 253', b' 254', b' 255', b' 256', b' 257',
b' 260', b' 261', b' 262', b' 263', b' 264', b' 265', b' 266', b' 267',
b' 270', b' 271', b' 272', b' 273', b' 274', b' 275', b' 276', b' 277',
b' 300', b' 301', b' 302', b' 303', b' 304', b' 305', b' 306', b' 307',
b' 310', b' 311', b' 312', b' 313', b' 314', b' 315', b' 316', b' 317',
b' 320', b' 321', b' 322', b' 323', b' 324', b' 325', b' 326', b' 327',
b' 330', b' 331', b' 332', b' 333', b' 334', b' 335', b' 336', b' 337',
b' 340', b' 341', b' 342', b' 343', b' 344', b' 345', b' 346', b' 347',
b' 350', b' 351', b' 352', b' 353', b' 354', b' 355', b' 356', b' 357',
b' 360', b' 361', b' 362', b' 363', b' 364', b' 365', b' 366', b' 367',
b' 370', b' 371', b' 372', b' 373', b' 374', b' 375', b' 376', b' 377',
b' ---', b' >>>', b' <<<'
]
r"""Character tokens lookup table."""
_HEX_LOWER = [b'%02x' % b for b in range(256)] + [b'--', b'>>', b'<<']
_HEX_UPPER = [b'%02X' % b for b in range(256)] + [b'--', b'>>', b'<<']
_HEX_LOWER_TOKENS = [b' %02x' % b for b in range(256)] + [b' --', b' >>', b' <<']
_HEX_UPPER_TOKENS = [b' %02X' % b for b in range(256)] + [b' --', b' >>', b' <<']
_OCTAL_TOKENS = [b' %03o' % b for b in range(256)] + [b' ---', b' >>>', b' <<<']
DEFAULT_FORMAT_ORDER: Sequence[str] = [
'one_byte_octal',
'one_byte_hex',
'one_byte_char',
'canonical',
'two_bytes_decimal',
'two_bytes_octal',
'two_bytes_hex',
]
r"""Default order of display options."""
def _format_default(
address: int,
chunk: AnyBytes,
width: int,
upper: bool,
) -> List[bytes]:
address_fmt = b'%07X' if upper else b'%07x'
tokens = [address_fmt % address]
table = _HEX_UPPER if upper else _HEX_LOWER
size = len(chunk)
tokens.extend((b' ' + table[chunk[offset+1]] + table[chunk[offset]])
for offset in range(0, size-1, 2))
if size & 1:
tokens.append(b' 00' + table[chunk[size-1]])
if size < width:
tokens.extend(b' ' for _ in range(1, width - size, 2))
return tokens
def _format_one_byte_octal(
address: int,
chunk: AnyBytes,
width: int,
upper: bool,
) -> List[bytes]:
address_fmt = b'%07X' if upper else b'%07x'
tokens = [address_fmt % address]
table = _OCTAL_TOKENS
tokens.extend(table[b] for b in chunk)
size = len(chunk)
if size < width:
tokens.extend(b' ' for _ in range(width - size))
return tokens
def _format_one_byte_hex(
address: int,
chunk: AnyBytes,
width: int,
upper: bool,
) -> List[bytes]:
address_fmt = b'%08X' if upper else b'%08x'
tokens = [address_fmt % address]
table = _HEX_UPPER_TOKENS if upper else _HEX_LOWER_TOKENS
tokens.extend(table[b] for b in chunk)
size = len(chunk)
if size < width:
tokens.extend(b' ' for _ in range(width - size))
return tokens
def _format_one_byte_char(
address: int,
chunk: AnyBytes,
width: int,
upper: bool,
) -> List[bytes]:
address_fmt = b'%07X' if upper else b'%07x'
tokens = [address_fmt % address]
table = CHAR_TOKENS
tokens.extend(table[b] for b in chunk)
size = len(chunk)
if size < width:
tokens.extend(b' ' for _ in range(width - size))
return tokens
def _format_canonical(
address: int,
chunk: AnyBytes,
width: int,
upper: bool,
) -> List[bytes]:
address_fmt = b'%08X' if upper else b'%08x'
tokens = [address_fmt % address]
table = _HEX_UPPER_TOKENS if upper else _HEX_LOWER_TOKENS
size = len(chunk)
offset = 0
append = tokens.append
for offset in range(size):
if (offset & 7) == 0:
append(b' ')
append(table[chunk[offset]])
for offset in range(offset + 1, width):
if (offset & 7) == 0:
append(b' ')
append(b' ')
table = CHAR_PRINTABLE
append(b' |')
tokens.extend(table[b] for b in chunk)
append(b'|')
return tokens
def _format_two_bytes_decimal(
address: int,
chunk: AnyBytes,
width: int,
upper: bool,
) -> List[bytes]:
address_fmt = b'%07X' if upper else b'%07x'
tokens = [address_fmt % address]
size = len(chunk)
tokens.extend(b' %05d' % (chunk[offset] | (chunk[offset+1] << 8))
for offset in range(0, size-1, 2))
if size & 1:
tokens.append(b' %05d' % chunk[size-1])
if size < width:
tokens.extend(b' ' for _ in range(1, width - size, 2))
return tokens
def _format_two_bytes_octal(
address: int,
chunk: AnyBytes,
width: int,
upper: bool,
) -> List[bytes]:
address_fmt = b'%07X' if upper else b'%07x'
tokens = [address_fmt % address]
del upper
size = len(chunk)
tokens.extend(b' %06o' % (chunk[offset] | (chunk[offset+1] << 8))
for offset in range(0, size-1, 2))
if size & 1:
tokens.append(b' %06o' % chunk[size-1])
if size < width:
tokens.extend(b' ' for _ in range(1, width - size, 2))
return tokens
def _format_two_bytes_hex(
address: int,
chunk: AnyBytes,
width: int,
upper: bool,
) -> List[bytes]:
address_fmt = b'%07X' if upper else b'%07x'
tokens = [address_fmt % address]
table = _HEX_UPPER if upper else _HEX_LOWER
size = len(chunk)
tokens.extend((b' ' + table[chunk[offset+1]] + table[chunk[offset]])
for offset in range(0, size-1, 2))
if size & 1:
tokens.append(b' 00' + table[chunk[size-1]])
if size < width:
tokens.extend(b' ' for _ in range(1, width - size, 2))
return tokens
_FormatHandler = Callable[[int, AnyBytes, int, bool], List[bytes]]
_FORMAT_HANDLERS: Mapping[str, _FormatHandler] = {
'default': _format_default,
'one_byte_octal': _format_one_byte_octal,
'one_byte_hex': _format_one_byte_hex,
'one_byte_char': _format_one_byte_char,
'canonical': _format_canonical,
'two_bytes_decimal': _format_two_bytes_decimal,
'two_bytes_octal': _format_two_bytes_octal,
'two_bytes_hex': _format_two_bytes_hex,
}
_ADDRESS_FMT: Mapping[str, bytes] = {
'default': b'%07x',
'one_byte_octal': b'%07x',
'one_byte_hex': b'%08x',
'one_byte_char': b'%07x',
'canonical': b'%08x',
'two_bytes_decimal': b'%07x',
'two_bytes_octal': b'%07x',
'two_bytes_hex': b'%07x',
}
# noinspection PyShadowingBuiltins
[docs]
def hexdump_core(
infile: Optional[Union[str, AnyBytes, IO]] = None,
outfile: Optional[Union[str, AnyBytes, IO]] = None,
one_byte_octal: bool = False,
one_byte_hex: bool = False,
one_byte_char: bool = False,
canonical: bool = False,
two_bytes_decimal: bool = False,
two_bytes_octal: bool = False,
two_bytes_hex: bool = False,
color: Optional[str] = None,
format: Optional[str] = None,
format_file: Optional[str] = None,
length: Optional[int] = None,
skip: Optional[int] = None,
no_squeezing: bool = False,
upper: bool = False,
width: int = 16,
linesep: Optional[AnyBytes] = None,
format_order: Optional[Sequence[str]] = None,
) -> IO:
r"""Emulation of the `hexdump` utility core.
Args:
infile (str or bytes):
Input data.
If :obj:`str`, it is considered as the input file path.
If :obj:`bytes`, it is the input byte chunk.
If ``None``, it reads from the standard input.
outfile (str or bytes):
Output data.
If :obj:`str`, it is considered as the output file path.
If :obj:`bytes`, it is the output byte chunk.
If ``None``, it writes to the standard output.
one_byte_octal (bool):
One-byte octal display. Display the input offset in
hexadecimal, followed by sixteen space-separated,
three-column, zero-filled bytes of input data, in octal, per
line.
one_byte_hex (bool):
One-byte hexadecimal display. Display the input offset in
hexadecimal, followed by sixteen space-separated, two-column,
zero-filled bytes of input data, in hexadecimal, per line.
one_byte_char (bool):
One-byte character display. Display the input offset in
hexadecimal, followed by sixteen space-separated,
three-column, space-filled characters of input data per line.
canonical (bool):
Canonical hex+ASCII display. Display the input offset in
hexadecimal, followed by sixteen space-separated, two-column,
hexadecimal bytes, followed by the same sixteen bytes in %_p
format enclosed in | characters. Invoking the program as hd
implies this option.
two_bytes_decimal (bool):
Two-byte decimal display. Display the input offset in
hexadecimal, followed by eight space-separated, five-column,
zero-filled, two-byte units of input data, in unsigned
decimal, per line.
two_bytes_octal (bool):
Two-byte octal display. Display the input offset in
hexadecimal, followed by eight space-separated, six-column,
zero-filled, two-byte quantities of input data, in octal, per
line.
two_bytes_hex (bool):
Two-byte hexadecimal display. Display the input offset in
hexadecimal, followed by eight space-separated, four-column,
zero-filled, two-byte quantities of input data, in
hexadecimal, per line.
color (str):
*CURRENTLY NOT SUPPORTED*. Please provide ``None``.
format (str):
*CURRENTLY NOT SUPPORTED*. Please provide ``None``.
format_file (str):
*CURRENTLY NOT SUPPORTED*. Please provide ``None``.
length (int):
Interpret only length bytes of input.
skip (int):
Skip offset bytes from the beginning of the input.
no_squeezing (bool):
The -v option causes hexdump to display all input data.
Without the -v option, any number of groups of output lines
which would be identical to the immediately preceding group
of output lines (except for the input offsets), are replaced
with a line comprised of a single asterisk.
upper (bool):
Uses upper case hex letters on address and data.
width (int):
Number of bytes per line.
linesep (bytes):
Line separator bytes.
format_order (list of str):
If not ``None``, it indicates the order of display options
(``one_byte_octal``, ``one_byte_hex``, ``one_byte_char``,
``canonical``, ``two_bytes_decimal``, ``two_bytes_octal``,
``two_bytes_hex``).
Duplicates are allowed.
Only those with the corresponding boolean argument true are used.
Returns:
stream: The handle to the output stream.
"""
if color is not None:
raise NotImplementedError('"color" option is not supported')
if format is not None:
raise NotImplementedError('"format" option is not supported')
if format_file is not None:
raise NotImplementedError('"format_file" option is not supported')
skip = 0 if skip is None else skip.__index__()
if skip < 0:
raise ValueError('negative skip')
if length is not None:
length = length.__index__()
if length < 0:
raise ValueError('negative length')
width = width.__index__()
width_min = 2 if two_bytes_decimal or two_bytes_octal or two_bytes_hex else 1
if width < width_min:
raise ValueError('invalid width')
if linesep is None:
linesep = os.linesep.encode()
format_flags = {
'one_byte_octal': one_byte_octal,
'one_byte_hex': one_byte_hex,
'one_byte_char': one_byte_char,
'canonical': canonical,
'two_bytes_decimal': two_bytes_decimal,
'two_bytes_octal': two_bytes_octal,
'two_bytes_hex': two_bytes_hex,
}
if format_order is None:
format_order = DEFAULT_FORMAT_ORDER
else:
for format_name in format_order:
if format_name not in format_flags:
raise ValueError(f'unknown format option: {format_name!r}')
format_handlers = [_FORMAT_HANDLERS[format_name]
for format_name in format_order
if format_flags[format_name]]
if not format_handlers:
format_order = ['default']
format_handlers = [_format_default]
do_squeezing = not no_squeezing
instream: Optional[IO, SparseMemoryIO] = None
outstream: Optional[IO] = None
try:
# Input stream binding
if infile is None:
infile = None
instream = sys.stdin.buffer
elif isinstance(infile, str):
instream = open(infile, 'rb')
elif isinstance(infile, (bytes, bytearray, memoryview)):
instream = io.BytesIO(infile)
elif isinstance(infile, ImmutableMemory):
instream = SparseMemoryIO(memory=infile)
else:
instream = infile
# Output stream binding
if outfile is None:
outfile = None
outstream = sys.stdout.buffer
elif isinstance(outfile, str):
outstream = open(outfile, 'wb')
else:
outstream = outfile
if skip:
instream.seek(skip, io.SEEK_CUR)
offset = 0
last_chunk = None
squeezing = False
read = instream.read
write = outstream.write
while True:
if length is None:
chunk = read(width)
else:
chunk = read(min(width, length - offset))
if not chunk:
break
if do_squeezing and chunk == last_chunk:
if not squeezing:
write(b'*')
write(linesep)
squeezing = True
else:
squeezing = False
address = skip + offset
for format_handler in format_handlers:
tokens = format_handler(address, chunk, width, upper)
tokens.append(linesep)
line = b''.join(tokens)
write(line)
last_chunk = chunk
offset += len(chunk)
address_fmt = _ADDRESS_FMT[format_order[-1]]
if upper:
address_fmt = address_fmt.upper()
write(address_fmt % (skip + offset))
write(linesep)
finally:
if instream is not None and isinstance(infile, str):
instream.close()
if outstream is not None and isinstance(outfile, str):
outstream.close()
return outstream