dtype='|S0' not working with A.df[:]

I ingested a MultiIndex dataframe as follows:

tiledb.from_pandas('Binance_VOptions',df.reset_index().set_index(['time','pair','type','strike','expiry',]))

This resulted in the following schema:

ArraySchema(
  domain=Domain(*[
    Dim(name='time', domain=(numpy.datetime64('2021-01-15T07:52:12.000000000'), numpy.datetime64('2021-02-15T22:14:31.000000000')), tile=10 nanoseconds, dtype='datetime64[ns]'),
    Dim(name='pair', domain=(None, None), tile=None, dtype='|S0'),
    Dim(name='type', domain=(None, None), tile=None, dtype='|S0'),
    Dim(name='strike', domain=(24000.0, 64000.0), tile=10.0, dtype='float64'),
    Dim(name='expiry', domain=(numpy.datetime64('2021-02-19T00:00:00.000000000'), numpy.datetime64('2021-03-26T00:00:00.000000000')), tile=10 nanoseconds, dtype='datetime64[ns]'),
  ]),
  attrs=[
    Attr(name='id', dtype='int64', var=False, filters=FilterList([ZstdFilter(level=1), ])),
    Attr(name='price', dtype='float64', var=False, filters=FilterList([ZstdFilter(level=1), ])),
    Attr(name='qty', dtype='float64', var=False, filters=FilterList([ZstdFilter(level=1), ])),
    Attr(name='quoteQty', dtype='float64', var=False, filters=FilterList([ZstdFilter(level=1), ])),
    Attr(name='side', dtype='int64', var=False, filters=FilterList([ZstdFilter(level=1), ])),
    Attr(name='contract', dtype='<U0', var=True, filters=FilterList([ZstdFilter(level=1), ])),
    Attr(name='CVD', dtype='float64', var=False, filters=FilterList([ZstdFilter(level=1), ])),
  ],
  cell_order='row-major',
  tile_order='row-major',
  capacity=10000,
  sparse=True,
  allows_duplicates=True,
  coords_filters=FilterList([ZstdFilter(level=-1), ])
)

But it looks like dimensions of dtype ‘|S0’ can’t be read using the .df[:] method (A[:] works):

with tiledb.open('Binance_VOptions','r') as A:
    data = A.query(use_arrow=True,dims=('time','pair','type','strike','expiry',)).df[:]
data

If I remove ‘pair’ and ‘type’ from the dims in the above query, the code works. Otherwise, the script crashes without any error message.

tiledb version: 0.8.2

OK, this seems to be a more general bug with the .df method:

example data:

df = pd.DataFrame(
    {
        'pair':np.array(['BTC','BTC','BTC','BTC','BTC'],dtype=np.bytes_),
        'expiry':[pd.Timestamp('2021-03-26 08:00:00'),pd.Timestamp('2021-03-26 08:00:00'),
                    pd.Timestamp('2021-03-26 08:00:00'),pd.Timestamp('2021-03-26 08:00:00'),
                    pd.Timestamp('2021-03-26 08:00:00')],
        'strike':[48000.0,48000.0,48000.0,48000.0,48000.0],
        'type':np.array(['P','P','P','P','P'],dtype=np.bytes_),
        'time':[pd.Timestamp('2021-02-12 14:57:40'),pd.Timestamp('2021-02-12 14:57:40'),
                    pd.Timestamp('2021-02-13 11:11:44'),pd.Timestamp('2021-02-13 11:11:44'),
                    pd.Timestamp('2021-02-13 19:00:56')],
        'id':[1125899906842668893,1125899906842668894,1125899906842668895,
              1125899906842668896,1125899906842669911],
        'price':[7640.60,7640.60,7675.31,7675.31,7500.00],
        'qty':[-0.0500,0.0500,0.0100,-0.0100,0.1274],
        'quoteQty':[-382.0300,382.0300,76.7531,-76.7531,955.5000],
        'side':[-1,1,1,-1,1]
    }
).set_index(['pair','expiry','strike','type','time'])

Create array:

uri = '/mnt/cloud-storage/data/options/Binance_vanilla'

tdb_config = tiledb.Config({
    "sm.tile_cache_size":str(10_000_000),
    "sm.consolidation.step_min_frags":"2",
    "sm.consolidation.step_max_frags":"60",
    "sm.consolidation.steps":"1",
    "sm.consolidation.buffer_size":str(10_000_000),
    "sm.consolidation.step_size_ratio": "0.5",
    "sm.consolidation.mode":"fragment_meta",
    "sm.num_reader_threads":"8",
    "sm.num_writer_threads":"8"
})
ctx = tiledb.Ctx(tdb_config)

# Domain
dom = tiledb.Domain(
    tiledb.Dim(name="pair", domain=(None,None), tile=None, dtype=np.bytes_),
    tiledb.Dim(name="expiry", domain=(np.datetime64('1980-01-01'), np.datetime64("2100-01-01")),
               tile=np.timedelta64(1, 'W'), dtype="datetime64[ns]"),
    tiledb.Dim(name="strike", domain=(0, 9e18), tile=1e4, dtype=np.float64),
    tiledb.Dim(name="type", domain=(None,None), tile=None, dtype=np.bytes_),
    tiledb.Dim(name="time", domain=(np.datetime64('1980-01-01'), np.datetime64("2100-01-01")),
               tile=np.timedelta64(1, 'W'), dtype="datetime64[ns]"),)

# List of available filters
bit_shuffle = tiledb.BitShuffleFilter()
byte_shuffle = tiledb.ByteShuffleFilter()
RLE = tiledb.RleFilter()
double_delta_encoding = tiledb.DoubleDeltaFilter()
positive_delta_encoding = tiledb.PositiveDeltaFilter()
bit_width_reduction = tiledb.BitWidthReductionFilter(window=int(1e3))
gzip = tiledb.GzipFilter(level=9)
lz4 = tiledb.LZ4Filter(level=9)
bzip2 = tiledb.Bzip2Filter(level=9)
zstd = tiledb.ZstdFilter(level=4)

# Attributes
attrs = [
    tiledb.Attr(name="id",dtype=np.int64,ctx=ctx,
                filters=tiledb.FilterList([double_delta_encoding,bit_width_reduction,zstd])),
    tiledb.Attr(name="price",dtype=np.float64,ctx=ctx,
               filters=tiledb.FilterList([zstd])),
    tiledb.Attr(name="qty",dtype=np.float64,ctx=ctx,
               filters=tiledb.FilterList([zstd])),
    tiledb.Attr(name="quoteQty",dtype=np.float64,ctx=ctx,
               filters=tiledb.FilterList([zstd])),
    tiledb.Attr(name="side",dtype=np.int64,ctx=ctx,
       filters=tiledb.FilterList([zstd])),
]
# Schema
schema = tiledb.ArraySchema(domain=dom, sparse=True,
                            attrs=attrs,coords_filters=[zstd],
                            cell_order="row-major",tile_order="row-major",
                            capacity=int(10e6),ctx=ctx,allows_duplicates=True)

if not os.path.exists(uri):
    tiledb.SparseArray.create(uri,schema)
with tiledb.SparseArray(uri, mode='w') as A:
    A[df.index.get_level_values('pair').values,
      df.index.get_level_values('expiry').values,
      df.index.get_level_values('strike').values,
      df.index.get_level_values('type').values,
      df.index.get_level_values('time').values] = {
        "id":df.id.values,
        "price":df.price.values,
        "qty":df.qty.values,
        "quoteQty":df.quoteQty.values,
        "side":df.side.values
    }

Read data:

# Works
uri = '/mnt/cloud-storage/data/options/Binance_vanilla'
with tiledb.SparseArray(uri, mode='r') as A:
    df = A.query(attrs=('price',),index_col='time')[:]
pd.DataFrame(df)

# Works
uri = '/mnt/cloud-storage/data/options/Binance_vanilla'
with tiledb.SparseArray(uri, mode='r') as A:
    df = A[:]
pd.DataFrame(df)

# Works
uri = '/mnt/cloud-storage/data/options/Binance_vanilla'
with tiledb.SparseArray(uri, mode='r') as A:
    df = A.query(attrs=('price',),dims=['expiry','strike','time',],index_col='time').df[:]
df

# Doesn't work - Crashes
uri = '/mnt/cloud-storage/data/options/Binance_vanilla'
with tiledb.SparseArray(uri, mode='r') as A:
    df = A.query(attrs=('price',),dims=['pair','expiry','strike','type','time',],index_col='time').df[:]
df

So, it seems like the .df method fails to return data with string coordinates.

Hi @Mtrl_Scientist, as mentioned in the related issue, we’ve released a fix for this issue in TileDB 2.2.7 / TileDB-Py 0.8.6, available on conda and PyPI. I’ve tested your code and confirmed the fix, but please let us know of any additional issues.

1 Like

Awesome work, thanks! :+1: