Thanks @stavros!
Number of arrays
2116
Total Number of fragments
Too many to count with walkdir, but each array has between 100 and 10,000 fragments. Most fragments are only a few KB in size, but some are several GB if they were previously consolidated.
Schemas
Schema 1
TileDB Config
# Schema 1: sparse 2-D array (agg_ID x date) with one float64 attribute per
# column of `df` from the third column onwards.

# Storage-manager configuration: 8 reader threads and 8 writer threads.
config = tiledb.Config()
config["sm.num_reader_threads"] = "8"
config["sm.num_writer_threads"] = "8"

# Context carrying the configuration above.
ctx = tiledb.Ctx(config)

# First dimension: int64 "agg_ID" over (0, 9e18) with a tile extent of 1e6.
agg_id_dim = tiledb.Dim(
    ctx=ctx,
    name="agg_ID",
    domain=(0, 9e18),
    tile=1e6,
    dtype=np.int64,
)

# Second dimension: nanosecond-resolution "date" with a tile extent of one day.
# The author's earlier int64 variant of this dimension is kept for reference:
#   tiledb.Dim(ctx=ctx,name="date", domain=(0, 9e18), tile=86.4e6, dtype=np.int64)
# NOTE(review): unlike agg_ID, this dimension is created without ctx=ctx --
# confirm that is intentional.
date_dim = tiledb.Dim(
    name="date",
    domain=(np.datetime64("1980-01-01"), np.datetime64("2100-01-01")),
    tile=np.timedelta64(1, "D"),
    dtype="datetime64[ns]",
)

# Domain, in the order (agg_ID, date).
dom = tiledb.Domain(agg_id_dim, date_dim)

# Catalogue of filter instances; the attributes below only use `zstd`.
bit_shuffle = tiledb.BitShuffleFilter()
byte_shuffle = tiledb.ByteShuffleFilter()
RLE = tiledb.RleFilter()
double_delta_encoding = tiledb.DoubleDeltaFilter()
positive_delta_encoding = tiledb.PositiveDeltaFilter()
bit_width_reduction = tiledb.BitWidthReductionFilter(window=1_000)
gzip = tiledb.GzipFilter(level=9)
lz4 = tiledb.LZ4Filter(level=9)
bzip2 = tiledb.Bzip2Filter(level=9)
zstd = tiledb.ZstdFilter(level=9)

# One Zstd-compressed float64 attribute per data column (the first two
# columns of `df` are skipped).
attrs = [
    tiledb.Attr(
        name=column,
        dtype=np.float64,
        ctx=ctx,
        filters=tiledb.FilterList([zstd]),
    )
    for column in df.iloc[:, 2:].columns
]

# Sparse schema with row-major cell and tile order; capacity is 10e6.
schema = tiledb.ArraySchema(
    domain=dom,
    sparse=True,
    attrs=attrs,
    cell_order="row-major",
    tile_order="row-major",
    capacity=10_000_000,
    ctx=ctx,
)
Example:
Schema 2
# Schema 2: sparse 2-D array (agg_ID x date) with six fixed trade attributes.

# Storage-manager configuration: 8 reader threads and 8 writer threads.
config = tiledb.Config()
config["sm.num_reader_threads"] = "8"
config["sm.num_writer_threads"] = "8"

# Context carrying the configuration above.
ctx = tiledb.Ctx(config)

# First dimension: int64 "agg_ID" over (0, 9e18) with a tile extent of 1e6.
agg_id_dim = tiledb.Dim(
    ctx=ctx,
    name="agg_ID",
    domain=(0, 9e18),
    tile=1e6,
    dtype=np.int64,
)

# Second dimension: nanosecond-resolution "date" with a tile extent of one day.
# The author's earlier int64 variant of this dimension is kept for reference:
#   tiledb.Dim(ctx=ctx,name="date", domain=(0, 9e18), tile=86.4e6, dtype=np.int64)
# NOTE(review): unlike agg_ID, this dimension is created without ctx=ctx --
# confirm that is intentional.
date_dim = tiledb.Dim(
    name="date",
    domain=(np.datetime64("1980-01-01"), np.datetime64("2100-01-01")),
    tile=np.timedelta64(1, "D"),
    dtype="datetime64[ns]",
)

# Domain, in the order (agg_ID, date).
dom = tiledb.Domain(agg_id_dim, date_dim)

# Catalogue of filter instances; the attributes below use `bit_shuffle` and
# `zstd`.
bit_shuffle = tiledb.BitShuffleFilter()
byte_shuffle = tiledb.ByteShuffleFilter()
RLE = tiledb.RleFilter()
double_delta_encoding = tiledb.DoubleDeltaFilter()
positive_delta_encoding = tiledb.PositiveDeltaFilter()
bit_width_reduction = tiledb.BitWidthReductionFilter(window=1_000)
gzip = tiledb.GzipFilter(level=9)
lz4 = tiledb.LZ4Filter(level=9)
bzip2 = tiledb.Bzip2Filter(level=9)
zstd = tiledb.ZstdFilter(level=9)

# (attribute name, dtype, filter pipeline) for every attribute, in schema order.
# The float64/int64 attributes are Zstd-compressed; the two int8 flag
# attributes are bit-shuffled before Zstd.
attr_specs = [
    ("price", np.float64, [zstd]),
    ("volume", np.float64, [zstd]),
    ("first_trade_ID", np.int64, [zstd]),
    ("last_trade_ID", np.int64, [zstd]),
    ("is_buyer_maker", np.int8, [bit_shuffle, zstd]),
    ("is_best_price_match", np.int8, [bit_shuffle, zstd]),
]

# Each attribute gets its own FilterList instance.
attrs = [
    tiledb.Attr(
        name=attr_name,
        dtype=attr_dtype,
        ctx=ctx,
        filters=tiledb.FilterList(pipeline),
    )
    for attr_name, attr_dtype, pipeline in attr_specs
]

# Sparse schema with row-major cell and tile order; capacity is 10e6.
schema = tiledb.ArraySchema(
    domain=dom,
    sparse=True,
    attrs=attrs,
    cell_order="row-major",
    tile_order="row-major",
    capacity=10_000_000,
    ctx=ctx,
)
Example:
Schema 3
# Schema 3: sparse 2-D array (price x date) with a single "quantity" attribute.

# Price tile extent depends on the second "_"-separated field of `ticker`:
# 1e-3 when it is "BTC", 1e3 otherwise.
# NOTE(review): the original comment called this a "1 cent increment", which
# matches neither value -- confirm the intended unit.
if ticker.split("_")[1] == "BTC":
    price_tile = 1e-3
else:
    price_tile = 1e3

# First dimension: float64 "price" over (0, 1e9).
price_dim = tiledb.Dim(
    name="price",
    domain=(0, 1e9),
    tile=price_tile,
    dtype=np.float64,
)

# Second dimension: nanosecond-resolution "date" with a tile extent of one day.
date_dim = tiledb.Dim(
    name="date",
    domain=(np.datetime64("1980-01-01"), np.datetime64("2100-01-01")),
    tile=np.timedelta64(1, "D"),
    dtype="datetime64[ns]",
)

# Domain, in the order (price, date).
dom = tiledb.Domain(price_dim, date_dim)

# Catalogue of filter instances; the attribute below uses `byte_shuffle` and
# `zstd` (level 5 in this schema).
bit_shuffle = tiledb.BitShuffleFilter()
byte_shuffle = tiledb.ByteShuffleFilter()
RLE = tiledb.RleFilter()
double_delta_encoding = tiledb.DoubleDeltaFilter()
positive_delta_encoding = tiledb.PositiveDeltaFilter()
bit_width_reduction = tiledb.BitWidthReductionFilter(window=1_000)
gzip = tiledb.GzipFilter(level=9)
lz4 = tiledb.LZ4Filter(level=9)
bzip2 = tiledb.Bzip2Filter(level=9)
zstd = tiledb.ZstdFilter(level=5)

# Single float64 attribute, byte-shuffled and then Zstd-compressed.
quantity_attr = tiledb.Attr(
    name="quantity",
    dtype=np.float64,
    filters=tiledb.FilterList([byte_shuffle, zstd]),
)
attrs = [quantity_attr]

# Sparse schema with column-major cell and tile order.
# Author's note: capacity is kept low (5e3) so as not to overwhelm the TileDB
# buffer.
schema = tiledb.ArraySchema(
    domain=dom,
    sparse=True,
    attrs=attrs,
    cell_order="col-major",
    tile_order="col-major",
    capacity=5_000,
)
Example:
long-format (as stored)

wide-format (transformed)
(Old) Notebook with some data to play around with:
https://1drv.ms/u/s!ArP7_EkyioIBwqsYPCCBmMJvJVYv4A?e=iHWLah