Query missing/NaN values

The "Query Conditions" page of the TileDB Embedded Docs does not provide examples of querying for missing (NaN) values.
How does one perform the equivalent of isna() or == None for querying in tiledb?

1 Like

Hi @nick-youngblut

We don’t support that directly in our core engine. Since I suspect this is SOMA related, you can do something like the following to get the coords (the soma_joinid values) of the obs rows whose attribute is non-null; these can then be passed into a subsequent ExperimentAxisQuery:

from typing import List

import tiledbsoma 
import tiledb


def find_nonnull_obs(
    soma_uri: str,
    attr: str,
) -> List[int]:
    """Return the ``soma_joinid`` of every obs row where *attr* is not null.

    The experiment at *soma_uri* is opened only to look up the URI of its
    ``obs`` dataframe; that array is then read directly with ``tiledb`` and
    the null filtering happens in pandas.

    Args:
        soma_uri: URI of the SOMA experiment.
        attr: Name of the obs column to test for missing values.

    Returns:
        The ``soma_joinid`` values of the rows whose *attr* value is present.
    """
    with tiledbsoma.Experiment.open(soma_uri) as experiment:
        obs_array_uri = experiment.obs.uri

    with tiledb.open(obs_array_uri) as obs_array:
        # Reads every row and column of obs. To shrink the result, apply
        # other query conditions here, e.g. obs_array.query().df[:]
        obs_frame = obs_array.df[:]

    has_value = obs_frame[attr].notna()
    non_null_joinids = obs_frame.loc[has_value, "soma_joinid"]
    return non_null_joinids.tolist()


if __name__ == "__main__":
    import sys

    # The experiment URI comes from the first command-line argument.
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <soma_experiment_uri>")
    uri = sys.argv[1]

    # soma_joinids of the cells whose "tissue" value is present.
    non_null_joinids = find_nonnull_obs(
        uri,
        "tissue",
    )

    with tiledbsoma.Experiment.open(uri) as exp:
        # Read the total once and reuse it below.
        n_all = exp.obs.count
        print(f"n cells all: {n_all}")

        # Restrict the obs axis to the non-null cells by passing their
        # join ids as coordinates.
        query = exp.axis_query(
            measurement_name="RNA",
            obs_query=tiledbsoma.AxisQuery(coords=(non_null_joinids,)),
        )

        n_non_null = query.n_obs
        print(f"n cells non-null: {n_non_null}")

        # An empty obs would otherwise raise ZeroDivisionError.
        proportion_null = (n_all - n_non_null) / n_all if n_all else float("nan")
        print(f"proportion null: {proportion_null}")
1 Like