It seems there is a huge performance difference between dense writes and sparse writes on dense arrays:
import numpy as np
import tiledb
import time
# Single int32 dimension named "seq" with bounds (0, 10000000) and a tile
# extent of 10000.
seq_dim = tiledb.Dim(name="seq", domain=(0, 10000000), tile=10000, dtype=np.int32)
dom = tiledb.Domain(seq_dim)

# One anonymous (empty-named) float32 attribute holds the cell values.
attr = tiledb.Attr(name="", dtype=np.float32)

# The same schema object is reused for every array created by this script.
schema = tiledb.ArraySchema(domain=dom, attrs=[attr])

# Create the first array on disk at "test1.tdb".
tiledb.DenseArray.create("test1.tdb", schema)
from numpy.random import default_rng

# Benchmark 1: write 6,000,000 values to randomly chosen, unordered cell
# indices of "test1.tdb", then read the first 7,000,000 cells back.
a = np.random.random(7000000)
rng = default_rng()
# 6,000,000 distinct indices drawn without replacement from range(7000000).
numbers = rng.choice(7000000, size=6000000, replace=False)

# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system time changes.
# The measured span includes opening and closing the array, because the
# timestamps are taken outside the `with` block.
t0 = time.perf_counter()
with tiledb.DenseArray("test1.tdb", "w") as A:
    # Index by an integer array rather than a slice.
    # NOTE(review): presumably this selects TileDB's per-coordinate
    # ("sparse") write path for dense arrays -- confirm against the
    # TileDB-Py documentation for the installed version.
    A[numbers] = a[:6000000]
t1 = time.perf_counter()
print(f"sparse writes takes {t1-t0} seconds")

# Read timing likewise covers open + read + close.
with tiledb.DenseArray("test1.tdb", "r") as A:
    b = A[:7000000]
t2 = time.perf_counter()
print(f"read for sparsely written array takes {t2 - t1} seconds")
print(b)
# Benchmark 2: write the same 6,000,000 values as one contiguous slice into a
# second array, then read the first 7,000,000 cells back. Reuses `schema` and
# `a` defined earlier in the script.
tiledb.DenseArray.create("test2.tdb", schema)

# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system time changes.
# The measured span includes opening and closing the array, because the
# timestamps are taken outside the `with` block.
t0 = time.perf_counter()
with tiledb.DenseArray("test2.tdb", "w") as A:
    # Contiguous slice assignment covering cells [0, 6000000).
    A[:6000000] = a[:6000000]
t1 = time.perf_counter()
print(f"dense writes take {t1-t0} seconds")

# Read timing likewise covers open + read + close.
with tiledb.DenseArray("test2.tdb", "r") as A:
    b = A[:7000000]
t2 = time.perf_counter()
print(f"read for densely written array takes {t2 - t1} seconds")
print(b)
Result:
sparse writes takes 1.9559354782104492 seconds
read for sparsely written array takes 1.9144158363342285 seconds
[0.72725064 nan 0.7601014 ... 0.15054736 0.8090051 0.11344466]
dense writes take 0.09776735305786133 seconds
read for densely written array takes 0.10718464851379395 seconds
[0.59625846 0.34717634 0.61473554 ... nan nan nan]
There seems to be a 20x performance difference for both writing and reading.
Is there any documentation on this?
Thanks in advance!