Skip to content

Utility Functions

The following are the main utility functions for creating archives.

dump

This function is used to write the object to the file.

Parameters:

Name Type Description Default
file str | BytesIO

a string representing the file path, or a BytesIO buffer to write to

required
obj

the object to be written to the file

required
kwargs

additional keyword arguments to be passed to the LazyWriter

{}

Returns:

Type Description

None

Source code in src/msglc/__init__.py
Python
def dump(file: str | BytesIO, obj, **kwargs):
    """
    Write a single object to the given target through a `LazyWriter`.

    The writer is opened as a context manager, so it is closed again as soon as
    the object has been written (or if writing raises).

    :param file: the file path as a string, or a `BytesIO` buffer to write into
    :param obj: the object to be written
    :param kwargs: extra keyword arguments forwarded unchanged to the `LazyWriter` constructor
    :return: None
    """
    writer_context = LazyWriter(file, **kwargs)
    with writer_context as writer:
        writer.write(obj)

combine

This function is used to combine multiple serialized files into a single archive.

Parameters:

Name Type Description Default
archive str | BytesIO

a string representing the file path of the archive

required
files FileInfo | list[FileInfo]

a single FileInfo object or a list of FileInfo objects

required
mode Literal['a', 'w']

a string representing the combination mode, 'w' for write and 'a' for append

'w'
validate bool

switch on to validate the files before combining

True

Returns:

Type Description

None

Source code in src/msglc/__init__.py
Python
def combine(
    archive: str | BytesIO,
    files: FileInfo | list[FileInfo],
    *,
    mode: Literal["a", "w"] = "w",
    validate: bool = True,
):
    """
    Merge several serialized files into one archive via a `LazyCombiner`.

    A single `FileInfo` is accepted and treated as a one-element list.
    The inputs must be either all named or all unnamed, and names must be unique.

    :param archive: the file path of the archive as a string, or a `BytesIO` buffer
    :param files: a single `FileInfo` object or a list of `FileInfo` objects
    :param mode: the combination mode, 'w' for write and 'a' for append
    :param validate: when true, check that every input exists (for paths) and starts
                     with the `LazyWriter` magic bytes before anything is combined
    :raises ValueError: if only some files are named, if names are duplicated, or
                        if validation of an input fails
    :return: None
    """
    if isinstance(files, FileInfo):
        files = [files]

    # Naming is all-or-nothing: a partially named set is rejected.
    named_count = len([entry for entry in files if entry.name is not None])
    if 0 < named_count < len(files):
        raise ValueError("Files must either all have names or all not have names.")

    # Fewer distinct names than files means duplicates, unless every file is
    # unnamed (the name set then collapses to the single value None).
    distinct_names = {entry.name for entry in files}
    has_duplicates = len(distinct_names) != len(files)
    all_unnamed = len(distinct_names) == 1 and next(iter(distinct_names)) is None
    if has_duplicates and not all_unnamed:
        raise ValueError("Files must have unique names.")

    def _check_magic(source):
        """Raise ValueError unless ``source`` starts with the expected magic bytes."""
        if not isinstance(source, str):
            # Stream input: peek at the header, then restore the read position
            # so the later copy starts from where the caller left the stream.
            start = source.tell()
            header = source.read(LazyWriter.magic_len())
            source.seek(start)
            if header != LazyWriter.magic:
                raise ValueError("Invalid file format.")
            return

        if not os.path.exists(source):
            raise ValueError(f"File {source} does not exist.")
        with open(source, "rb") as handle:
            if handle.read(LazyWriter.magic_len()) != LazyWriter.magic:
                raise ValueError(f"Invalid file format: {source}.")

    if validate:
        for entry in files:
            _check_magic(entry.path)

    def _drain(stream):
        """Yield successive chunks of ``config.copy_chunk_size`` bytes until a read comes back empty."""
        while block := stream.read(config.copy_chunk_size):
            yield block

    def _chunks(source: str | BinaryIO):
        """Yield the content of a path (opened in binary mode) or of an open stream, chunk by chunk."""
        if isinstance(source, str):
            with open(source, "rb") as handle:
                yield from _drain(handle)
        else:
            yield from _drain(source)

    with LazyCombiner(archive, mode=mode) as combiner:
        for entry in files:
            combiner.write(_chunks(entry.path), entry.name)

append

This function is used to append multiple serialized files to a single existing archive.

Parameters:

Name Type Description Default
archive str | BytesIO

a string representing the file path of the archive

required
files FileInfo | list[FileInfo]

a single FileInfo object or a list of FileInfo objects

required
validate bool

switch on to validate the files before combining

True

Returns:

Type Description

None

Source code in src/msglc/__init__.py
Python
def append(
    archive: str | BytesIO, files: FileInfo | list[FileInfo], *, validate: bool = True
):
    """
    Append serialized files to an existing archive.

    This is a convenience wrapper that delegates to `combine` with the mode fixed to 'a'.

    :param archive: the file path of the archive as a string, or a `BytesIO` buffer
    :param files: a single `FileInfo` object or a list of `FileInfo` objects
    :param validate: forwarded to `combine`; switch on to validate the files before combining
    :return: None
    """
    combine(archive=archive, files=files, validate=validate, mode="a")

configure

This function is used to configure the settings. It accepts any number of keyword arguments. The function updates the values of the configuration parameters if they are provided in the arguments.

Parameters:

Name Type Description Default
small_obj_optimization_threshold int | None

The threshold (in bytes) for small object optimization. Objects smaller than this threshold are not indexed.

None
write_buffer_size int | None

The size (in bytes) for the write buffer.

None
read_buffer_size int | None

The size (in bytes) for the read buffer.

None
fast_loading bool | None

Flag to enable or disable fast loading. If enabled, the container will be read in one go, instead of reading each child separately.

None
fast_loading_threshold int | float | None

The threshold (0 to 1) for fast loading. With the fast loading flag turned on, fast loading will be performed if the number of already read children over the total number of children is smaller than this threshold.

None
trivial_size int | None

The size (in bytes) considered trivial, around a dozen bytes. Objects smaller than this size are considered trivial. For a list of trivial objects, the container will be indexed in a blocked fashion.

None
disable_gc bool | None

Flag controlling whether garbage collection is disabled.

None
simple_repr bool | None

Flag to enable or disable simple representation used in the repr method. If turned on, repr will not incur any disk I/O.

None
copy_chunk_size int | None

The size (in bytes) for the copy chunk.

None
numpy_encoder bool | None

Flag to enable or disable the numpy support. If enabled, the numpy arrays will be encoded using the dumps method provided by numpy. The arrays are stored as binary data directly. If disabled, the numpy arrays will be converted to lists before encoding.

None
magic bytes | None

Magic bytes (max length: 30) to set, used to identify the file format version.

None
Source code in src/msglc/config.py
Python
def configure(
    *,
    small_obj_optimization_threshold: int | None = None,
    write_buffer_size: int | None = None,
    read_buffer_size: int | None = None,
    fast_loading: bool | None = None,
    fast_loading_threshold: int | float | None = None,
    trivial_size: int | None = None,
    disable_gc: bool | None = None,
    simple_repr: bool | None = None,
    copy_chunk_size: int | None = None,
    numpy_encoder: bool | None = None,
    magic: bytes | None = None,
):
    """
    Update the global configuration from the supplied keyword arguments.

    Every argument is optional. An argument that is left as ``None``, or that fails its
    type/range check, is skipped silently and the matching setting keeps its current value.

    :param small_obj_optimization_threshold:
            The threshold (in bytes) for small object optimization; must be a positive integer.
            Objects smaller than this threshold are not indexed.
            If the current trivial size is larger than the new threshold, it is lowered to match.
    :param write_buffer_size:
            The size (in bytes) for the write buffer; must be a positive integer.
    :param read_buffer_size:
            The size (in bytes) for the read buffer; must be a positive integer.
    :param fast_loading:
            Flag to enable or disable fast loading.
            If enabled, the container will be read in one go, instead of reading each child separately.
    :param fast_loading_threshold:
            The threshold (0 to 1, inclusive) for fast loading.
            With the fast loading flag turned on, fast loading will be performed if the number of
            already read children over the total number of children is smaller than this threshold.
    :param trivial_size:
            The size (in bytes) considered trivial, around a dozen bytes; must be a positive integer.
            Objects smaller than this size are considered trivial.
            For a list of trivial objects, the container will be indexed in a blocked fashion.
            If the new size is larger than the current small object threshold, that threshold
            is raised to match.
    :param disable_gc:
            Flag stored as the `disable_gc` setting, controlling whether garbage collection is disabled.
    :param simple_repr:
            Flag to enable or disable simple representation used in the __repr__ method.
            If turned on, __repr__ will not incur any disk I/O.
    :param copy_chunk_size:
            The size (in bytes) for the copy chunk; must be a positive integer.
    :param numpy_encoder:
            Flag to enable or disable the `numpy` support.
            If enabled, the `numpy` arrays will be encoded using the `dumps` method provided by `numpy`.
            The arrays are stored as binary data directly.
            If disabled, the `numpy` arrays will be converted to lists before encoding.
    :param magic:
            Magic bytes (non-empty, max length: 30) to set, used to identify the file format version.
    """

    def _is_positive_int(candidate) -> bool:
        """Return True for integers strictly greater than zero."""
        return isinstance(candidate, int) and candidate > 0

    def _is_flag(candidate) -> bool:
        """Return True only for genuine booleans (``None`` means "leave unchanged")."""
        return isinstance(candidate, bool)

    if _is_positive_int(small_obj_optimization_threshold):
        config.small_obj_optimization_threshold = small_obj_optimization_threshold
        # Keep the invariant trivial_size <= small_obj_optimization_threshold.
        if config.small_obj_optimization_threshold < config.trivial_size:
            config.trivial_size = config.small_obj_optimization_threshold

    if _is_positive_int(write_buffer_size):
        config.write_buffer_size = write_buffer_size

    if _is_positive_int(read_buffer_size):
        config.read_buffer_size = read_buffer_size

    if _is_flag(fast_loading):
        config.fast_loading = fast_loading

    if isinstance(fast_loading_threshold, (int, float)):
        if 0 <= fast_loading_threshold <= 1:
            config.fast_loading_threshold = fast_loading_threshold

    # Handled after the small-object threshold on purpose: when both are given
    # and conflict, the trivial size wins and the threshold is raised to it.
    if _is_positive_int(trivial_size):
        config.trivial_size = trivial_size
        if config.small_obj_optimization_threshold < config.trivial_size:
            config.small_obj_optimization_threshold = config.trivial_size

    if _is_flag(disable_gc):
        config.disable_gc = disable_gc

    if _is_flag(simple_repr):
        config.simple_repr = simple_repr

    if _is_positive_int(copy_chunk_size):
        config.copy_chunk_size = copy_chunk_size

    if _is_flag(numpy_encoder):
        config.numpy_encoder = numpy_encoder

    if isinstance(magic, bytes) and 1 <= len(magic) <= max_magic_len:
        # Imported here rather than at module level, as in the original code.
        from msglc import LazyWriter

        LazyWriter.set_magic(magic)