Hello,
I would like to create a sparse array that has a variable-length string attribute, but I’m reading back invalid values after filling the array in the way shown in the TileDB examples. So I’m wondering whether I’m filling the array incorrectly, or whether this is perhaps a bug (I’m using TileDB-Py 0.7.4 with NumPy 1.19.4, under Python 3.8).
Here is a minimal reproducible example, which also includes an integer-valued variable-length attribute that works as expected:
#
# Generic initialization and array creation
#
import tiledb
import numpy as np
# Name of the test array; passed to create_array() and to the tiledb open calls below.
array_name = "array_test"
def create_array(array_name):
    """Create a sparse TileDB array with two variable-length attributes.

    The schema has two bytes-typed dimensions, "rows" and "cols", and two
    attributes declared with ``var=True``: "lab" (bytes) and "val" (int64).

    Args:
        array_name: Forwarded to ``tiledb.SparseArray.create`` as the array
            to create.
    """
    ctx = tiledb.Ctx()

    # Both dimensions share the same declaration apart from their name.
    dimensions = [
        tiledb.Dim(name=dim_name, domain=(None, None), dtype=np.bytes_)
        for dim_name in ("rows", "cols")
    ]
    domain = tiledb.Domain(*dimensions, ctx=ctx)

    # One bytes attribute and one int64 attribute, both variable-length.
    attributes = [
        tiledb.Attr(name="lab", var=True, dtype=np.bytes_, ctx=ctx),
        tiledb.Attr(name="val", var=True, dtype=np.int64, ctx=ctx),
    ]

    schema = tiledb.ArraySchema(
        domain=domain,
        sparse=True,
        attrs=attributes,
        ctx=ctx,
    )
    tiledb.SparseArray.create(array_name, schema)
# Create the array before any data is written to it.
create_array(array_name)
Now, if the string attribute is set with only a single value per cell:
#
# Generate data for 2 cells: [(a, c), (b, d)]
#
# Coordinates of the two cells being written: (a, c) and (b, d).
keys_rows = ['a', 'b']
keys_cols = ['c', 'd']

# Single label per cell: this works as expected
lab_a_c = np.array(['xx'], dtype=np.bytes_)
lab_b_d = np.array(['zzz'], dtype=np.bytes_)
labs = np.array([lab_a_c, lab_b_d], dtype='O')

# Integer values per cell; the two cells hold different numbers of values.
val_a_c = np.array([1, 2], dtype=np.int64)
val_b_d = np.array([3], dtype=np.int64)
vals = np.array([val_a_c, val_b_d], dtype='O')
then I get the expected result:
# Write both cells, giving one entry per attribute.
with tiledb.open(array_name, "w") as writer:
    writer[keys_rows, keys_cols] = {"lab": labs, "val": vals}

# Read back the cell at coordinates ('a', 'c').
with tiledb.SparseArray(array_name, mode='r') as reader:
    out = reader['a', 'c']
# out
OrderedDict([('lab', array([b'xx'], dtype=object)),
('val', array([array([1, 2])], dtype=object)),
('rows', array([b'a'], dtype=object)),
('cols', array([b'c'], dtype=object))])
On the other hand, with multiple values per cell for the string attribute:
#
# Generate data for 2 cells: [(a, c), (b, d)]
#
# Coordinates of the two cells being written: (a, c) and (b, d).
keys_rows = ['a', 'b']
keys_cols = ['c', 'd']

# Multiple labels per cell: invalid data returned
lab_a_c = np.array(['xx', 'yyy'], dtype=np.bytes_)
lab_b_d = np.array(['zzz', 'llll', 'ppppp'], dtype=np.bytes_)
labs = np.array([lab_a_c, lab_b_d], dtype='O')

# Integer values per cell; the two cells hold different numbers of values.
val_a_c = np.array([1, 2], dtype=np.int64)
val_b_d = np.array([3], dtype=np.int64)
vals = np.array([val_a_c, val_b_d], dtype='O')
then I get invalid values when reading back:
# out
OrderedDict([('lab', array([b'\xf0\xde'], dtype=object)),
('val', array([array([1, 2])], dtype=object)),
('rows', array([b'a'], dtype=object)),
('cols', array([b'c'], dtype=object))])
Best,
Ian