Getting VCF samples and headers with Java API

I am learning to use the TileDB-VCF Java API to read data from a TileDB-VCF dataset. Using examples from the test code, I am able to read data from the data array. Is there a way to use the Java API to get a list of sample names and the headers for those samples? Alternatively, is it possible to use TileDB-Java to get that information?

Hi @pjbradbury,

Here is the code showcasing how you can use TileDB-Java to get a list of sample names and their headers.

Please let us know if you have any further questions.

/**
 * Reads every value of the "sample" field from the "vcf_headers" array.
 *
 * <p>The query is resubmitted until it is no longer incomplete. When a submit returns no data
 * while the query is still incomplete, the buffers are too small to hold a single result, so
 * they are reallocated at twice the previous size before retrying.
 *
 * @return the sample names, in the order the query returned them
 * @throws TileDBError if opening the array or running the query fails
 */
public ArrayList<String> getSamples() throws TileDBError {
  ArrayList<String> samples = new ArrayList<>();
  int bufferSize = 10000;

  // try-with-resources closes query, array and context (in that order) even when a call throws
  try (Context context = new Context();
      Array array = new Array(context, URIString("vcf_headers"), TILEDB_READ);
      Query query = new Query(array, TILEDB_READ)) {

    allocBuffers(query, context, bufferSize, "sample", TILEDB_STRING_ASCII);

    boolean incomplete;
    do {
      query.submit();
      incomplete = query.getQueryStatus() == QueryStatus.TILEDB_INCOMPLETE;

      byte[] data = (byte[]) query.getBuffer("sample");
      long[] offsets = query.getOffsetsBuffer("sample");
      addSamplesToResult(samples, offsets, data);

      if (incomplete && data.length == 0) {
        // Nothing fit into the buffers: actually grow them, so repeated retries keep doubling
        bufferSize *= 2;
        allocBuffers(query, context, bufferSize, "sample", TILEDB_STRING_ASCII);
      }
    } while (incomplete); // run until query complete
  }
  return samples;
}

/**
 * Reads the "header" field of the "vcf_headers" array for a single sample.
 *
 * <p>The query is restricted to the range {@code [sample, sample]} on dimension 0 and is
 * resubmitted until it is no longer incomplete. Data returned by every submit is appended to the
 * result, so partial batches are not lost. When a submit returns no data while the query is still
 * incomplete, the buffers are reallocated at twice the previous size before retrying.
 *
 * @param sample The sample name to get the header of
 * @return the header text read for the sample; empty if the query returned no data
 * @throws TileDBError if opening the array or running the query fails
 */
public String getHeader(String sample) throws TileDBError {
  StringBuilder header = new StringBuilder();
  int bufferSize = 1000000;

  // try-with-resources closes query, array and context (in that order) even when a call throws
  try (Context context = new Context();
      Array array = new Array(context, URIString("vcf_headers"), TILEDB_READ);
      Query query = new Query(array, TILEDB_READ)) {

    // slice on a specific sample
    SubArray subArray = new SubArray(context, array);
    subArray.addRangeVar(0, sample, sample);
    query.setSubarray(subArray);

    allocBuffers(query, context, bufferSize, "header", TILEDB_CHAR);

    boolean incomplete;
    do {
      query.submit();
      incomplete = query.getQueryStatus() == QueryStatus.TILEDB_INCOMPLETE;

      byte[] data = (byte[]) query.getBuffer("header");
      // Explicit charset so decoding does not depend on the platform default
      header.append(new String(data, java.nio.charset.StandardCharsets.UTF_8));

      if (incomplete && data.length == 0) {
        // if buffer size is small increase the size by a factor of 2
        bufferSize *= 2;
        allocBuffers(query, context, bufferSize, "header", TILEDB_CHAR);
      }
    } while (incomplete); // run until query complete
  }
  return header.toString();
}


// ==================================== Helper methods ==================================== 


/**
 * Splits one batch of variable-length results into strings and appends them to {@code samples}.
 *
 * <p>Cell {@code i} spans the bytes from {@code offsets[i]} (inclusive) to {@code offsets[i + 1]}
 * (exclusive); the last cell runs to the end of {@code data}. An empty {@code offsets} array adds
 * nothing.
 *
 * @param samples The arraylist that contains all samples
 * @param offsets the batch offsets: the starting byte position of each cell in {@code data}
 * @param data the batch data: the concatenated bytes of all cells
 */
private void addSamplesToResult(ArrayList<String> samples, long[] offsets, byte[] data) {
  for (int i = 0; i < offsets.length; ++i) {
    int start = (int) offsets[i];
    int end = (i + 1 < offsets.length) ? (int) offsets[i + 1] : data.length;
    // Decode straight from the shared buffer with an explicit charset (no intermediate copy,
    // no dependence on the platform default encoding)
    samples.add(new String(data, start, end - start, java.nio.charset.StandardCharsets.UTF_8));
  }
}


/**
 * Creates a data buffer and an offsets buffer of the requested size and attaches both to the
 * query under the given field name. A {@link TileDBError} raised while doing so is rethrown
 * wrapped in a {@link RuntimeException}.
 *
 * @param query The query that receives the buffers
 * @param context The context used to create the native arrays
 * @param bufferSize The size passed to both native arrays
 * @param bufferName The name of the attribute/dimension to be read
 * @param datatype The datatype of the attribute/dimension (used for the data buffer)
 */
private void allocBuffers(Query query, Context context, int bufferSize, String bufferName, Datatype datatype) {
  try {
    NativeArray dataBuffer = new NativeArray(context, bufferSize, datatype);
    query.setDataBuffer(bufferName, dataBuffer);

    // offsets are always stored as unsigned 64-bit integers
    NativeArray offsetsBuffer = new NativeArray(context, bufferSize, TILEDB_UINT64);
    query.setOffsetsBuffer(bufferName, offsetsBuffer);
  } catch (TileDBError failure) {
    throw new RuntimeException(failure);
  }
}


/**
 * Builds a {@code file://} URI string for an array name.
 *
 * @param arrayName the array path; converted to an absolute path before prefixing
 * @return {@code "file://"} followed by the absolute path
 */
private String URIString(String arrayName) {
  String absolutePath = Paths.get(arrayName).toAbsolutePath().toString();
  return "file://" + absolutePath;
}

Best,
Dimitris

Thanks! I have it working.