Use of io.tiledb.java.api.Domain in project TileDB-Spark by TileDB-Inc.
From the class TileDBBatch, method planInputPartitions:
@Override
public InputPartition[] planInputPartitions() {
  metricsUpdater.startTimer(dataSourcePlanBatchInputPartitionsTimerName);
  ArrayList<InputPartition> readerPartitions = new ArrayList<>();
  try {
    Context ctx = new Context(tileDBDataSourceOptions.getTileDBConfigMap(true));
    // Fetch the array and load its metadata
    Array array = new Array(ctx, util.tryGetArrayURI(tileDBDataSourceOptions).toString());
    HashMap<String, Pair> nonEmptyDomain = array.nonEmptyDomain();
    Domain domain = array.getSchema().getDomain();
    List<List<Range>> ranges = new ArrayList<>();
    // Populate initial range list
    for (int i = 0; i < domain.getNDim(); i++) {
      ranges.add(new ArrayList<>());
    }
    for (int i = 0; i < array.getSchema().getAttributeNum(); i++) {
      ranges.add(new ArrayList<>());
    }
    // Build range from all pushed filters
    if (pushedFilters != null) {
      for (Filter filter : pushedFilters) {
        List<List<Range>> allRanges = buildRangeFromFilter(filter, nonEmptyDomain).getFirst();
        for (int i = 0; i < allRanges.size(); i++) {
          ranges.get(i).addAll(allRanges.get(i));
        }
      }
    }
    // For any existing ranges we try to merge into super ranges
    for (int i = 0; i < domain.getNDim() + array.getSchema().getAttributeNum(); i++) {
      List<Range> range = ranges.get(i);
      if (range.isEmpty()) {
        String columnName = this.tileDBReadSchema.getColumnName(i).get();
        if (this.tileDBReadSchema.hasDimension(columnName)) {
          range.add(new Range(nonEmptyDomain.get(columnName)));
        } else {
          range.add(new Range(true, new Pair(null, null)));
        }
      } else {
        List<Range> mergedRanges = checkAndMergeRanges(range);
        ranges.set(i, mergedRanges);
      }
    }
    List<SubArrayRanges> subarrays = new ArrayList<>();
    generateAllSubarrays(
        ranges.subList(0, (int) (domain.getNDim())), subarrays, 0, new ArrayList<>());
    int availablePartitions = tileDBDataSourceOptions.getPartitionCount();
    if (availablePartitions > 1) {
      // Base case where we don't have any (or just single) pushdown per dimension
      if (subarrays.size() == 1 && subarrays.get(0).splittable()) {
        subarrays = subarrays.get(0).splitToPartitions(availablePartitions);
      } else {
        // Sort subarrays based on volume so largest volume is first
        subarrays.sort(Collections.reverseOrder());
        // Find median volume of subarrays
        SubArrayRanges medianSubarray = subarrays.get(subarrays.size() / 2);
        Number medianVolume = medianSubarray.getVolume();
        List<Integer> neededSplitsToReduceToMedianVolume =
            computeNeededSplitsToReduceToMedianVolume(
                subarrays.subList(0, subarrays.size() / 2),
                medianVolume,
                medianSubarray.getDatatype());
        int sumOfNeededSplitsForEvenDistributed =
            neededSplitsToReduceToMedianVolume.stream().mapToInt(Integer::intValue).sum();
        for (int i = 0; i < neededSplitsToReduceToMedianVolume.size(); i++) {
          SubArrayRanges subarray = subarrays.get(i);
          // Don't try to split unsplittable subarrays
          if (!subarray.splittable()) {
            continue;
          }
          /*
           Imprecision from double division doesn't matter here; we just want close-enough percentages.
           The weights are computed based on the percentage of needed splits required to reduce each
           subarray to the median volume.
           For example, if we have 5 subarrays with volumes of 10, 15, 50, 200, 400,
           the median volume is 50.
           The number of splits needed to reduce to the median is 400 / 50 = 8 and 200 / 50 = 4.
           The weights are therefore
           8 / (8+4) = 0.66 and 4 / (8+4) = 0.33 for the 400 and 200 subarrays respectively.
           If the number of available splits is 3 (so we cannot perform the full 8 + 4 splits needed
           to reach the median), we apply ceil(0.66 * 3) = 2 splits to the 400 subarray and
           ceil(0.33 * 3) = 1 split to the 200 subarray.
           This results in final subarray volumes of 10, 15, 50, 100, 100, 100, 100, 100, 100.
          */
          int numberOfWeightedSplits =
              (int)
                  Math.ceil(
                      neededSplitsToReduceToMedianVolume.get(i).doubleValue()
                          / sumOfNeededSplitsForEvenDistributed
                          * availablePartitions);
          List<SubArrayRanges> splitSubarray = subarray.split(numberOfWeightedSplits);
          subarrays.remove(i);
          subarrays.addAll(splitSubarray);
        }
      }
    }
    List<List<Range>> attributeRanges = new ArrayList<>();
    for (int i = ((int) domain.getNDim());
        i < domain.getNDim() + array.getSchema().getAttributeNum();
        i++) {
      attributeRanges.add(ranges.get(i));
    }
    for (SubArrayRanges subarray : subarrays) {
      // In the future we will be smarter about combining ranges to have partitions work on more
      // than one range
      // I.E. don't over partition like we probably are doing now
      List<List<Range>> subarrayRanges = new ArrayList<>();
      subarrayRanges.add(subarray.getRanges());
      readerPartitions.add(
          new TileDBDataInputPartition(
              util.tryGetArrayURI(tileDBDataSourceOptions),
              tileDBReadSchema,
              tileDBDataSourceOptions,
              subarrayRanges,
              attributeRanges));
    }
    metricsUpdater.finish(dataSourcePlanBatchInputPartitionsTimerName);
    InputPartition[] partitionsArray = new InputPartition[readerPartitions.size()];
    partitionsArray = readerPartitions.toArray(partitionsArray);
    array.close();
    ctx.close();
    domain.close();
    return partitionsArray;
  } catch (TileDBError tileDBError) {
    log.log(ERROR, tileDBError.getMessage());
    metricsUpdater.finish(dataSourcePlanBatchInputPartitionsTimerName);
  }
  return null;
}
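The weighted-split heuristic described in the block comment above can be reproduced in isolation. The sketch below is a simplified illustration, not the project's code: it assumes plain long volumes, assumes that each split halves a subarray (which is what the worked example in the comment implies), and omits SubArrayRanges, splittability checks, and TileDB datatypes.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class WeightedSplitSketch {
  public static void main(String[] args) {
    // Hypothetical subarray volumes, sorted in descending order as in planInputPartitions
    List<Long> volumes = Arrays.asList(400L, 200L, 50L, 15L, 10L);
    int availablePartitions = 3;

    // Median volume: the element at index size / 2 of the descending-sorted list
    long medianVolume = volumes.get(volumes.size() / 2); // 50

    // Splits needed to reduce each above-median subarray down to the median volume
    List<Integer> neededSplits = new ArrayList<>();
    for (long volume : volumes.subList(0, volumes.size() / 2)) {
      neededSplits.add((int) (volume / medianVolume)); // 400 -> 8, 200 -> 4
    }
    int sumOfNeededSplits = neededSplits.stream().mapToInt(Integer::intValue).sum(); // 12

    // Hand out the available partitions proportionally to each subarray's needed splits
    List<Long> result = new ArrayList<>(volumes.subList(volumes.size() / 2, volumes.size()));
    for (int i = 0; i < neededSplits.size(); i++) {
      int weightedSplits =
          (int) Math.ceil((double) neededSplits.get(i) / sumOfNeededSplits * availablePartitions);
      // Assumption: each split halves the subarray, so 2 splits turn 400 into four pieces of 100
      long pieces = 1L << weightedSplits;
      for (long p = 0; p < pieces; p++) {
        result.add(volumes.get(i) / pieces);
      }
    }
    Collections.sort(result);
    System.out.println(result); // [10, 15, 50, 100, 100, 100, 100, 100, 100]
  }
}

Running the sketch reproduces the final volumes listed in the comment: the two largest subarrays absorb the available splits in proportion to how far they sit above the median.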
Use of io.tiledb.java.api.Domain in project TileDB-Spark by TileDB-Inc.
From the class TileDBBatchWrite, method writeArraySchema:
private static void writeArraySchema(
    Context ctx, URI uri, StructType sparkSchema, TileDBDataSourceOptions options)
    throws TileDBError {
  ArrayType type = ArrayType.TILEDB_SPARSE;
  try (ArraySchema arraySchema = new ArraySchema(ctx, type);
      Domain domain = new Domain(ctx)) {
    String[] dimNames = TileDBWriteSchema.getSchemaDimensionOptions(sparkSchema, options);
    StructField[] sparkFields = sparkSchema.fields();
    for (int dimIdx = 0; dimIdx < dimNames.length; dimIdx++) {
      String dimName = dimNames[dimIdx];
      int idx = sparkSchema.fieldIndex(dimName);
      try (Dimension dim =
          TileDBWriteSchema.toDimension(
              ctx, dimName, dimIdx, sparkFields[idx].dataType(), options)) {
        domain.addDimension(dim);
      }
    }
    // set domain
    arraySchema.setDomain(domain);
    // add attributes
    for (StructField field : sparkFields) {
      // skip over dims
      if (Arrays.stream(dimNames).anyMatch(field.name()::equals)) {
        continue;
      }
      try (Attribute attr = TileDBWriteSchema.toAttribute(ctx, field, options)) {
        arraySchema.addAttribute(attr);
      }
    }
    // set schema tile / cell layouts
    Optional<Layout> schemaCellOrder = options.getSchemaCellOrder();
    if (schemaCellOrder.isPresent()) {
      arraySchema.setCellOrder(schemaCellOrder.get());
    }
    Optional<Layout> schemaTileOrder = options.getSchemaTileOrder();
    if (schemaTileOrder.isPresent()) {
      arraySchema.setTileOrder(schemaTileOrder.get());
    }
    // schema filters
    Optional<List<Pair<String, Integer>>> coordsFilters = options.getSchemaCoordsFilterList();
    if (coordsFilters.isPresent()) {
      try (FilterList filterList =
          TileDBWriteSchema.createTileDBFilterList(ctx, coordsFilters.get())) {
        arraySchema.setCoodsFilterList(filterList);
      }
    }
    Optional<List<Pair<String, Integer>>> offsetsFilters = options.getSchemaOffsetsFilterList();
    if (offsetsFilters.isPresent()) {
      try (FilterList filterList =
          TileDBWriteSchema.createTileDBFilterList(ctx, offsetsFilters.get())) {
        arraySchema.setOffsetsFilterList(filterList);
      }
    }
    // set capacity
    Optional<Long> schemaCapacity = options.getSchemaCapacity();
    if (schemaCapacity.isPresent()) {
      arraySchema.setCapacity(schemaCapacity.get());
    }
    // set allows dups
    if (options.getSchemaAllowDups().isPresent() && options.getSchemaAllowDups().get()) {
      arraySchema.setAllowDups(1);
    }
    arraySchema.check();
    Array.create(uri.toString(), arraySchema);
  }
}
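For context, writeArraySchema runs when a Spark DataFrame is written to a URI that does not yet contain a TileDB array. The following is a minimal, hypothetical usage sketch: the format name io.tiledb.spark and the uri option follow the TileDB-Spark documentation, while the schema.dim.0.name option and the file paths are illustrative assumptions that should be checked against TileDBDataSourceOptions for your version.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class TileDBWriteExample {
  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder().appName("tiledb-write-example").master("local[*]").getOrCreate();

    // Any DataFrame works; here we read an existing Parquet file (hypothetical path)
    Dataset<Row> df = spark.read().parquet("/tmp/input.parquet");

    // Writing through the TileDB data source creates the array schema for new arrays
    df.write()
        .format("io.tiledb.spark")
        .option("uri", "/tmp/my_tiledb_array")
        // Hypothetical option: declare which DataFrame column becomes dimension 0
        .option("schema.dim.0.name", "id")
        .mode(SaveMode.ErrorIfExists)
        .save();

    spark.stop();
  }
}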