How can we store variable-length binary blobs?

I have a use case where I’m trying to associate binary data and metadata (as attributes) with string identifiers and timestamps (as the domains).

Because the primary domain is a string identifier, I believe this requires that I use a sparse array - is this correct?

The code to define, open and write to the array is as follows:

def open_db():
    # Check whether the array exists, and create if required
    if not tiledb.object_type(array_name) == "array":
        domain = tiledb.Domain(
            tiledb.Dim(name="image_id", dtype="ascii"),
            # other domains removed for simplicity
        )

        attributes = [
            tiledb.Attr(name="image_data", dtype=np.bytes_, var=True),
            # other attributes removed for simplicity
        ]

        schema = tiledb.ArraySchema(domain=domain, sparse=True, attrs=attributes)

        tiledb.Array.create(array_name, schema)

    # Create and return a reference to the array object
    return tiledb.SparseArray(array_name, mode="w")

def write_image(image_id, image_data, metadata):
    with open_db() as A:
        A[image_id] = {
            "image_data": np.frombuffer(image_data, dtype=np.uint8),
        }

Note that I have tried using np.uint8 as the image_data attribute dtype, as well as other variations.

When I try to store an image, I get the following exception: ValueError: value length (71027400) does not match coordinate length (1)

What is the correct way to store a binary blob in an array using string identifiers as one or more of the domains?

Hi @hotplot,

To make this work consistently for writes of one or more cells, you need to work around some of numpy’s object-array creation rules so that the assigned-from array always has object dtype (see, e.g., this discussion).
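
To see the numpy behavior in isolation (nothing TileDB-specific here), compare direct construction of a one-element object array with the empty-then-assign workaround; this is just an illustrative sketch with made-up variable names:

import numpy as np

blob = np.random.rand(100)

# Direct construction collapses the nesting: numpy produces a (1, 100)
# object array of scalars rather than a length-1 array holding one array.
direct = np.array([blob], dtype="O")
print(direct.shape)       # (1, 100)

# Workaround: allocate an empty object array first, then assign into it.
# The result is a length-1 object array whose single element is `blob`.
wrapped = np.empty(1, dtype="O")
wrapped[:] = [blob]
print(wrapped.shape)      # (1,)
print(wrapped[0].shape)   # (100,)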

Here’s a working, modified example:

import tiledb
import numpy as np
import random
import tempfile

array_name = tempfile.mkdtemp()

def open_db():
    # Check whether the array exists, and create if required
    if not tiledb.object_type(array_name) == "array":
        domain = tiledb.Domain(
            tiledb.Dim(name="image_id", dtype="ascii"),
            # other domains removed for simplicity
        )

        attributes = [
            tiledb.Attr(name="image_data", dtype=np.bytes_, var=True),
            # other attributes removed for simplicity
        ]

        schema = tiledb.ArraySchema(domain=domain, sparse=True, attrs=attributes)

        tiledb.Array.create(array_name, schema)

    # Create and return a reference to the array object
    return tiledb.SparseArray(array_name, mode="w")

def write_image(coords, image_data, metadata):
    with open_db() as A:
        # image_data should already be a 1-D object array with one
        # variable-length value per coordinate in coords
        A[coords] = {
            "image_data": image_data,
        }

if __name__ == "__main__":
    # This doesn't work: with only one element, numpy collapses the inner array
    # into a 2-D object array of scalars instead of a length-1 array of arrays
    img_data = np.array([np.random.rand(random.randint(500,5000)) for _ in range(1)], dtype="O")
    coords = ["image1"]
    try:
        write_image(coords, img_data, {})
    except Exception as exc:
        print("single cell write failed: ", exc)

    # This works:
    img_data = np.array([np.random.rand(random.randint(500,5000)) for _ in range(2)], dtype="O")
    coords = ["image2", "image3"]
    write_image(coords, img_data, {})

    # General solution for one or more cells: create an empty object array,
    # then assign the list of values into it.
    coords = ["image4"]
    img_data = [np.random.rand(random.randint(500,5000))]
    data_array = np.empty(len(img_data), dtype="O")
    data_array[:] = img_data
    write_image(coords, data_array, {})
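
Not part of the write example above, but to sanity-check the result you could read a cell back by its string coordinate along these lines (a rough sketch; I'm assuming the multi_index read path here, so adjust as needed for your schema):

# Rough read-back sketch; assumes the array was written as in the example above.
with tiledb.open(array_name, mode="r") as A:
    result = A.multi_index[["image2", "image3"]]
    # result["image_data"] holds one variable-length value per selected cell
    for cell in result["image_data"]:
        print(len(cell))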

HTH,
Isaiah


Thanks Isaiah, that resolved the issue 🙂
