I have a use case where I'm trying to store binary data and metadata (as attributes), indexed by string identifiers and timestamps as the dimensions of the array's domain.
Because the primary domain is a string identifier, I believe this requires that I use a sparse array - is this correct?
The code to define, open and write to the array is as follows:
def open_db():
    """Open the array at ``array_name`` for writing, creating it if needed.

    When no TileDB array exists at ``array_name``, a sparse array is created
    with a single ``"ascii"`` dimension named ``image_id`` and a single
    variable-length ``np.bytes_`` attribute named ``image_data``.

    Returns:
        A ``tiledb.SparseArray`` opened in write mode.
    """
    already_exists = tiledb.object_type(array_name) == "array"
    if not already_exists:
        # other domains removed for simplicity
        image_id_dim = tiledb.Dim(name="image_id", dtype="ascii")
        domain = tiledb.Domain(image_id_dim)
        # other attributes removed for simplicity
        image_data_attr = tiledb.Attr(name="image_data", dtype=np.bytes_, var=True)
        schema = tiledb.ArraySchema(
            domain=domain,
            sparse=True,
            attrs=[image_data_attr],
        )
        tiledb.Array.create(array_name, schema)

    # Hand back a write-mode handle; callers use it as a context manager.
    return tiledb.SparseArray(array_name, mode="w")
def write_image(image_id, image_data, metadata):
    """Write one image buffer into the array under ``image_id``.

    ``image_data`` is viewed as a 1-D ``np.uint8`` array via
    ``np.frombuffer`` and assigned to the ``image_data`` attribute.
    ``metadata`` is not used in this simplified version.
    """
    with open_db() as array:
        pixels = np.frombuffer(image_data, dtype=np.uint8)
        array[image_id] = {"image_data": pixels}
Note that I have tried using np.uint8 as the image_data attribute datatype, and other variations as well.
When I try to store an image, I get the exception `ValueError: value length (71027400) does not match coordinate length (1)`.
What is the correct way to store a binary blob in an array using string identifiers as one or more of the domains?
To make this work consistently for one or more cells, you need to work around some of NumPy's array-creation rules so that the array being assigned always has object dtype (see, e.g., this discussion).
Here’s a working, modified example:
import tiledb
import numpy as np
import random
import tempfile
# Path of a freshly created temporary directory; open_db() uses it as the
# location at which the TileDB array is created and opened.
array_name = tempfile.mkdtemp()
def open_db():
    """Return a write-mode handle to the array at ``array_name``.

    If ``array_name`` does not yet hold a TileDB array, a sparse array is
    created first, with one ``"ascii"`` dimension (``image_id``) and one
    variable-length ``np.bytes_`` attribute (``image_data``).
    """
    if tiledb.object_type(array_name) != "array":
        schema = tiledb.ArraySchema(
            domain=tiledb.Domain(
                tiledb.Dim(name="image_id", dtype="ascii"),
                # other domains removed for simplicity
            ),
            sparse=True,
            attrs=[
                tiledb.Attr(name="image_data", dtype=np.bytes_, var=True),
                # other attributes removed for simplicity
            ],
        )
        tiledb.Array.create(array_name, schema)

    writer = tiledb.SparseArray(array_name, mode="w")
    return writer
def write_image(coords, image_data, metadata):
    """Assign ``image_data`` to the ``image_data`` attribute at ``coords``.

    ``image_data`` is wrapped in a list and converted to an object-dtype
    NumPy array before the assignment. ``metadata`` is not used in this
    simplified version.
    """
    with open_db() as array:
        cells = np.array([image_data], dtype="O")
        array[coords] = {"image_data": cells}
if __name__ == "__main__":
    # Single cell. This doesn't work, because the inner array is forced to
    # object dtype as well: NumPy expands the lone inner array into a 2-D
    # (1, N) object array instead of a 1-element array holding one array.
    single = np.random.rand(random.randint(500, 5000))
    img_data = np.array([single], dtype="O")
    coords = ["image1"]
    try:
        write_image(coords, img_data, {})
    except Exception as exc:
        # Deliberately broad: the demo only reports whatever the write raised.
        print("single cell write failed: ", exc)

    # Two cells. This works:
    pair = [np.random.rand(random.randint(500, 5000)) for _ in range(2)]
    img_data = np.array(pair, dtype="O")
    coords = ["image2", "image3"]
    write_image(coords, img_data, {})

    # General solution for 1+ members: create empty object array, then assign.
    coords = ["image4"]
    img_data = [np.array([np.random.rand(random.randint(500, 5000))], dtype="O")]
    data_array = np.empty(len(img_data), dtype="O")
    # Slice assignment fills each slot with one item of img_data.
    data_array[:] = img_data
    write_image(coords, data_array, {})