OK, this seems to be a more general bug with the .df method:
Example data:
# Example frame: five rows that share the same pair/expiry/strike/type and
# differ only in trade time and payload columns. The five key columns are
# moved into a MultiIndex so they line up with the array dimensions.
df = pd.DataFrame(
    {
        'pair': np.array(['BTC'] * 5, dtype=np.bytes_),
        'expiry': [pd.Timestamp('2021-03-26 08:00:00')] * 5,
        'strike': [48000.0] * 5,
        'type': np.array(['P'] * 5, dtype=np.bytes_),
        'time': [
            pd.Timestamp('2021-02-12 14:57:40'),
            pd.Timestamp('2021-02-12 14:57:40'),
            pd.Timestamp('2021-02-13 11:11:44'),
            pd.Timestamp('2021-02-13 11:11:44'),
            pd.Timestamp('2021-02-13 19:00:56'),
        ],
        'id': [
            1125899906842668893,
            1125899906842668894,
            1125899906842668895,
            1125899906842668896,
            1125899906842669911,
        ],
        'price': [7640.60, 7640.60, 7675.31, 7675.31, 7500.00],
        'qty': [-0.0500, 0.0500, 0.0100, -0.0100, 0.1274],
        'quoteQty': [-382.0300, 382.0300, 76.7531, -76.7531, 955.5000],
        'side': [-1, 1, 1, -1, 1],
    }
).set_index(['pair', 'expiry', 'strike', 'type', 'time'])
Create array:
# Location of the array on disk.
uri = '/mnt/cloud-storage/data/options/Binance_vanilla'

# TileDB settings (cache size, consolidation, thread counts).
# Every value is handed to tiledb.Config as a string.
tdb_config = tiledb.Config(
    {
        "sm.tile_cache_size": str(10_000_000),
        "sm.consolidation.step_min_frags": "2",
        "sm.consolidation.step_max_frags": "60",
        "sm.consolidation.steps": "1",
        "sm.consolidation.buffer_size": str(10_000_000),
        "sm.consolidation.step_size_ratio": "0.5",
        "sm.consolidation.mode": "fragment_meta",
        "sm.num_reader_threads": "8",
        "sm.num_writer_threads": "8",
    }
)

# Context carrying the configuration above; it is passed to the attributes
# and the schema further down.
ctx = tiledb.Ctx(tdb_config)
# Domain: five dimensions, in the same order as the DataFrame index levels.
# 'pair' and 'type' are bytes dimensions declared without a fixed domain or
# tile extent; 'expiry' and 'time' are nanosecond datetimes tiled by one
# week; 'strike' is a float64.
dom = tiledb.Domain(
    tiledb.Dim(
        name="pair",
        domain=(None, None),
        tile=None,
        dtype=np.bytes_,
    ),
    tiledb.Dim(
        name="expiry",
        domain=(np.datetime64('1980-01-01'), np.datetime64("2100-01-01")),
        tile=np.timedelta64(1, 'W'),
        dtype="datetime64[ns]",
    ),
    tiledb.Dim(
        name="strike",
        domain=(0, 9e18),
        tile=1e4,
        dtype=np.float64,
    ),
    tiledb.Dim(
        name="type",
        domain=(None, None),
        tile=None,
        dtype=np.bytes_,
    ),
    tiledb.Dim(
        name="time",
        domain=(np.datetime64('1980-01-01'), np.datetime64("2100-01-01")),
        tile=np.timedelta64(1, 'W'),
        dtype="datetime64[ns]",
    ),
)
# Filters available for building filter pipelines. In this snippet only
# double_delta_encoding, bit_width_reduction and zstd are actually used;
# the rest are instantiated for reference.

# Shuffle filters
bit_shuffle = tiledb.BitShuffleFilter()
byte_shuffle = tiledb.ByteShuffleFilter()

# Encoding filters
RLE = tiledb.RleFilter()
double_delta_encoding = tiledb.DoubleDeltaFilter()
positive_delta_encoding = tiledb.PositiveDeltaFilter()
bit_width_reduction = tiledb.BitWidthReductionFilter(window=int(1e3))

# General-purpose compressors
gzip = tiledb.GzipFilter(level=9)
lz4 = tiledb.LZ4Filter(level=9)
bzip2 = tiledb.Bzip2Filter(level=9)
zstd = tiledb.ZstdFilter(level=4)
# Attributes: one per payload column of the DataFrame.
# 'id' gets a double-delta -> bit-width-reduction -> zstd pipeline; the
# remaining attributes are compressed with zstd only. Each attribute gets
# its own FilterList instance.
attrs = [
    tiledb.Attr(
        name="id",
        dtype=np.int64,
        ctx=ctx,
        filters=tiledb.FilterList(
            [double_delta_encoding, bit_width_reduction, zstd]
        ),
    ),
    *[
        tiledb.Attr(
            name=attr_name,
            dtype=attr_dtype,
            ctx=ctx,
            filters=tiledb.FilterList([zstd]),
        )
        for attr_name, attr_dtype in (
            ("price", np.float64),
            ("qty", np.float64),
            ("quoteQty", np.float64),
            ("side", np.int64),
        )
    ],
]
# Schema: sparse array over the domain and attributes defined above,
# row-major cell and tile order, zstd on the coordinates, a capacity of
# int(10e6) == 10_000_000, and duplicate coordinates allowed (the example
# data contains rows with identical coordinates).
schema = tiledb.ArraySchema(
    domain=dom,
    sparse=True,
    attrs=attrs,
    coords_filters=[zstd],
    cell_order="row-major",
    tile_order="row-major",
    capacity=int(10e6),
    ctx=ctx,
    allows_duplicates=True,
)
# Create the array only if nothing exists at the target path yet.
if not os.path.exists(uri):
    tiledb.SparseArray.create(uri, schema)

# Write the frame: the index levels become the coordinates (one array per
# dimension, in domain order) and each column becomes an attribute buffer.
with tiledb.SparseArray(uri, mode='w') as A:
    A[
        tuple(
            df.index.get_level_values(level).values
            for level in ('pair', 'expiry', 'strike', 'type', 'time')
        )
    ] = {
        column: df[column].values
        for column in ("id", "price", "qty", "quoteQty", "side")
    }
Read data:
# Works: plain query indexing (no .df), 'price' attribute only.
uri = '/mnt/cloud-storage/data/options/Binance_vanilla'
with tiledb.SparseArray(uri, mode='r') as A:
    df = A.query(
        attrs=('price',),
        index_col='time',
    )[:]
# Build the frame from the returned result.
pd.DataFrame(df)
# Works: direct full slice of the array, no query object involved.
uri = '/mnt/cloud-storage/data/options/Binance_vanilla'
with tiledb.SparseArray(uri, mode='r') as A:
    df = A[:]
# Build the frame from the returned result.
pd.DataFrame(df)
# Works: .df with only the datetime and float dimensions requested
# ('pair' and 'type' are left out).
uri = '/mnt/cloud-storage/data/options/Binance_vanilla'
with tiledb.SparseArray(uri, mode='r') as A:
    df = A.query(
        attrs=('price',),
        dims=['expiry', 'strike', 'time'],
        index_col='time',
    ).df[:]
df
# Doesn't work - Crashes: identical to the previous case except that the
# bytes dimensions 'pair' and 'type' are also requested.
uri = '/mnt/cloud-storage/data/options/Binance_vanilla'
with tiledb.SparseArray(uri, mode='r') as A:
    df = A.query(
        attrs=('price',),
        dims=['pair', 'expiry', 'strike', 'type', 'time'],
        index_col='time',
    ).df[:]
df
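So it seems that the .df indexer crashes whenever the query includes the string (bytes) dimensions, here 'pair' and 'type'; with only the datetime and float dimensions it returns data as expected.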