Example 1 with Domain

Use of io.tiledb.java.api.Domain in project TileDB-Spark by TileDB-Inc.

From the class TileDBBatch, method planInputPartitions:

@Override
public InputPartition[] planInputPartitions() {
    metricsUpdater.startTimer(dataSourcePlanBatchInputPartitionsTimerName);
    ArrayList<InputPartition> readerPartitions = new ArrayList<>();
    try {
        Context ctx = new Context(tileDBDataSourceOptions.getTileDBConfigMap(true));
        // Fetch the array and load its metadata
        Array array = new Array(ctx, util.tryGetArrayURI(tileDBDataSourceOptions).toString());
        HashMap<String, Pair> nonEmptyDomain = array.nonEmptyDomain();
        Domain domain = array.getSchema().getDomain();
        List<List<Range>> ranges = new ArrayList<>();
        // Populate initial range list
        for (int i = 0; i < domain.getNDim(); i++) {
            ranges.add(new ArrayList<>());
        }
        for (int i = 0; i < array.getSchema().getAttributeNum(); i++) {
            ranges.add(new ArrayList<>());
        }
        // Build range from all pushed filters
        if (pushedFilters != null) {
            for (Filter filter : pushedFilters) {
                List<List<Range>> allRanges = buildRangeFromFilter(filter, nonEmptyDomain).getFirst();
                for (int i = 0; i < allRanges.size(); i++) {
                    ranges.get(i).addAll(allRanges.get(i));
                }
            }
        }
        // For any existing ranges we try to merge into super ranges
        for (int i = 0; i < domain.getNDim() + array.getSchema().getAttributeNum(); i++) {
            List<Range> range = ranges.get(i);
            if (range.isEmpty()) {
                String columnName = this.tileDBReadSchema.getColumnName(i).get();
                if (this.tileDBReadSchema.hasDimension(columnName)) {
                    range.add(new Range(nonEmptyDomain.get(columnName)));
                } else {
                    range.add(new Range(true, new Pair(null, null)));
                }
            } else {
                List<Range> mergedRanges = checkAndMergeRanges(range);
                ranges.set(i, mergedRanges);
            }
        }
        List<SubArrayRanges> subarrays = new ArrayList<>();
        generateAllSubarrays(ranges.subList(0, (int) (domain.getNDim())), subarrays, 0, new ArrayList<>());
        int availablePartitions = tileDBDataSourceOptions.getPartitionCount();
        if (availablePartitions > 1) {
            // Base case: no pushdown (or only a single pushdown range) per dimension
            if (subarrays.size() == 1 && subarrays.get(0).splittable()) {
                subarrays = subarrays.get(0).splitToPartitions(availablePartitions);
            } else {
                // Sort subarrays based on volume so largest volume is first
                subarrays.sort(Collections.reverseOrder());
                // Find the median volume of the subarrays
                SubArrayRanges medianSubarray = subarrays.get(subarrays.size() / 2);
                Number medianVolume = medianSubarray.getVolume();
                List<Integer> neededSplitsToReduceToMedianVolume = computeNeededSplitsToReduceToMedianVolume(subarrays.subList(0, subarrays.size() / 2), medianVolume, medianSubarray.getDatatype());
                int sumOfNeededSplitsForEvenDistributed = neededSplitsToReduceToMedianVolume.stream().mapToInt(Integer::intValue).sum();
                for (int i = 0; i < neededSplitsToReduceToMedianVolume.size(); i++) {
                    SubArrayRanges subarray = subarrays.get(i);
                    // Don't try to split unsplittable subarrays
                    if (!subarray.splittable()) {
                        continue;
                    }
                    /*
                     Imprecision from double division does not matter here; we only need close-enough percentages.

                     The weights are computed from each subarray's share of the splits needed to reduce it to the median volume.
                     For example, given 5 subarrays with volumes of 10, 15, 50, 200 and 400, the median is 50.
                     The number of splits needed to reach the median is 400 / 50 = 8 and 200 / 50 = 4.
                     The weights are therefore 8 / (8 + 4) = 0.66 and 4 / (8 + 4) = 0.33 for the 400 and 200 subarrays respectively.
                     If only 3 splits are available (so we cannot grant the full 8 + 4 splits needed to reach the median),
                     we apply ceil(0.66 * 3) = 2 splits to the 400 subarray and ceil(0.33 * 3) = 1 split to the 200 subarray.
                     This results in final subarray volumes of 10, 15, 50, 100, 100, 100, 100, 100, 100.
                     A numeric sketch of this computation appears after this example.
                    */
                    int numberOfWeightedSplits = (int) Math.ceil(neededSplitsToReduceToMedianVolume.get(i).doubleValue() / sumOfNeededSplitsForEvenDistributed * availablePartitions);
                    List<SubArrayRanges> splitSubarray = subarray.split(numberOfWeightedSplits);
                    subarrays.remove(i);
                    subarrays.addAll(splitSubarray);
                }
            }
        }
        List<List<Range>> attributeRanges = new ArrayList<>();
        for (int i = ((int) domain.getNDim()); i < domain.getNDim() + array.getSchema().getAttributeNum(); i++) {
            attributeRanges.add(ranges.get(i));
        }
        for (SubArrayRanges subarray : subarrays) {
            // In the future we will be smarter about combining ranges so a partition can work on
            // more than one range, i.e. avoid over-partitioning the way we probably do now
            List<List<Range>> subarrayRanges = new ArrayList<>();
            subarrayRanges.add(subarray.getRanges());
            readerPartitions.add(new TileDBDataInputPartition(util.tryGetArrayURI(tileDBDataSourceOptions), tileDBReadSchema, tileDBDataSourceOptions, subarrayRanges, attributeRanges));
        }
        metricsUpdater.finish(dataSourcePlanBatchInputPartitionsTimerName);
        InputPartition[] partitionsArray = new InputPartition[readerPartitions.size()];
        partitionsArray = readerPartitions.toArray(partitionsArray);
        array.close();
        ctx.close();
        domain.close();
        return partitionsArray;
    } catch (TileDBError tileDBError) {
        log.log(ERROR, tileDBError.getMessage());
        metricsUpdater.finish(dataSourcePlanBatchInputPartitionsTimerName);
    }
    return null;
}
Also used : TaskContext(org.apache.spark.TaskContext) Context(io.tiledb.java.api.Context) ArrayList(java.util.ArrayList) Array(io.tiledb.java.api.Array) Filter(org.apache.spark.sql.sources.Filter) TileDBError(io.tiledb.java.api.TileDBError) ArrayList(java.util.ArrayList) List(java.util.List) InputPartition(org.apache.spark.sql.connector.read.InputPartition) Domain(io.tiledb.java.api.Domain) Pair(io.tiledb.java.api.Pair)
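
To make the weighted-split arithmetic from the comment in planInputPartitions concrete, here is a minimal, self-contained sketch that reruns the 10 / 15 / 50 / 200 / 400 example in plain Java. The class and variable names (volumes, medianVolume, availablePartitions) are illustrative only and are not part of the TileDB-Spark API.

import java.util.Arrays;
import java.util.List;

public class WeightedSplitSketch {
    public static void main(String[] args) {
        // Illustrative subarray volumes from the comment above, sorted in descending order
        List<Long> volumes = Arrays.asList(400L, 200L, 50L, 15L, 10L);
        // The median volume (middle element of the sorted list) is 50
        long medianVolume = volumes.get(volumes.size() / 2);
        int availablePartitions = 3;

        // Splits needed to reduce each oversized subarray to the median: 400/50 = 8, 200/50 = 4
        long[] neededSplits = {volumes.get(0) / medianVolume, volumes.get(1) / medianVolume};
        long sumOfNeededSplits = neededSplits[0] + neededSplits[1]; // 12

        for (long needed : neededSplits) {
            // Weight each subarray by its share of the needed splits, scaled by the split budget
            int weightedSplits =
                (int) Math.ceil((double) needed / sumOfNeededSplits * availablePartitions);
            System.out.println("needed=" + needed + " -> weighted splits=" + weightedSplits);
        }
        // Prints 2 for the 400 subarray and 1 for the 200 subarray, matching the worked example.
    }
}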

Example 2 with Domain

Use of io.tiledb.java.api.Domain in project TileDB-Spark by TileDB-Inc.

From the class TileDBBatchWrite, method writeArraySchema:

private static void writeArraySchema(Context ctx, URI uri, StructType sparkSchema, TileDBDataSourceOptions options) throws TileDBError {
    ArrayType type = ArrayType.TILEDB_SPARSE;
    try (ArraySchema arraySchema = new ArraySchema(ctx, type);
        Domain domain = new Domain(ctx)) {
        String[] dimNames = TileDBWriteSchema.getSchemaDimensionOptions(sparkSchema, options);
        StructField[] sparkFields = sparkSchema.fields();
        for (int dimIdx = 0; dimIdx < dimNames.length; dimIdx++) {
            String dimName = dimNames[dimIdx];
            int idx = sparkSchema.fieldIndex(dimName);
            try (Dimension dim = TileDBWriteSchema.toDimension(ctx, dimName, dimIdx, sparkFields[idx].dataType(), options)) {
                domain.addDimension(dim);
            }
        }
        // set domain
        arraySchema.setDomain(domain);
        // add attributes
        for (StructField field : sparkFields) {
            // skip over dims
            if (Arrays.stream(dimNames).anyMatch(field.name()::equals)) {
                continue;
            }
            try (Attribute attr = TileDBWriteSchema.toAttribute(ctx, field, options)) {
                arraySchema.addAttribute(attr);
            }
        }
        // set schema tile / layouts and remaining attributes
        Optional<Layout> schemaCellOrder = options.getSchemaCellOrder();
        if (schemaCellOrder.isPresent()) {
            arraySchema.setCellOrder(schemaCellOrder.get());
        }
        Optional<Layout> schemaTileOrder = options.getSchemaTileOrder();
        if (schemaTileOrder.isPresent()) {
            arraySchema.setTileOrder(schemaTileOrder.get());
        }
        // schema filters
        Optional<List<Pair<String, Integer>>> coordsFilters = options.getSchemaCoordsFilterList();
        if (coordsFilters.isPresent()) {
            try (FilterList filterList = TileDBWriteSchema.createTileDBFilterList(ctx, coordsFilters.get())) {
                arraySchema.setCoodsFilterList(filterList);
            }
        }
        Optional<List<Pair<String, Integer>>> offsetsFilters = options.getSchemaOffsetsFilterList();
        if (offsetsFilters.isPresent()) {
            try (FilterList filterList = TileDBWriteSchema.createTileDBFilterList(ctx, offsetsFilters.get())) {
                arraySchema.setOffsetsFilterList(filterList);
            }
        }
        // set capacity
        Optional<Long> schemaCapacity = options.getSchemaCapacity();
        if (schemaCapacity.isPresent()) {
            arraySchema.setCapacity(schemaCapacity.get());
        }
        // set allows dups
        if (options.getSchemaAllowDups().isPresent() && options.getSchemaAllowDups().get())
            arraySchema.setAllowDups(1);
        arraySchema.check();
        Array.create(uri.toString(), arraySchema);
    }
}
Also used : Attribute(io.tiledb.java.api.Attribute) FilterList(io.tiledb.java.api.FilterList) Dimension(io.tiledb.java.api.Dimension) ArrayType(io.tiledb.java.api.ArrayType) ArraySchema(io.tiledb.java.api.ArraySchema) StructField(org.apache.spark.sql.types.StructField) Layout(io.tiledb.java.api.Layout) FilterList(io.tiledb.java.api.FilterList) List(java.util.List) Domain(io.tiledb.java.api.Domain)
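
As a companion to writeArraySchema above, the following is a minimal standalone sketch of building and persisting a comparable sparse schema directly with the tiledb-java API (Context, Domain, Dimension, Attribute, ArraySchema, Array.create). The array URI, dimension bounds, and attribute name are placeholders chosen for illustration, and the sketch omits the cell/tile order, filter-list, and capacity options that writeArraySchema reads from TileDBDataSourceOptions.

import io.tiledb.java.api.*;

public class CreateSparseArraySketch {
    public static void main(String[] args) throws TileDBError {
        // "my_sparse_array" is a placeholder URI for this sketch
        String uri = "my_sparse_array";
        try (Context ctx = new Context();
            ArraySchema schema = new ArraySchema(ctx, ArrayType.TILEDB_SPARSE);
            Domain domain = new Domain(ctx);
            // A single long dimension over [1, 100] with tile extent 10
            Dimension<Long> rows =
                new Dimension<>(ctx, "rows", Long.class, new Pair<>(1L, 100L), 10L);
            // One integer attribute
            Attribute a1 = new Attribute(ctx, "a1", Integer.class)) {
            domain.addDimension(rows);
            schema.setDomain(domain);
            schema.addAttribute(a1);
            // Validate the schema before persisting it, as writeArraySchema does
            schema.check();
            Array.create(uri, schema);
        }
    }
}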

Aggregations

Domain (io.tiledb.java.api.Domain) 2
List (java.util.List) 2
Array (io.tiledb.java.api.Array) 1
ArraySchema (io.tiledb.java.api.ArraySchema) 1
ArrayType (io.tiledb.java.api.ArrayType) 1
Attribute (io.tiledb.java.api.Attribute) 1
Context (io.tiledb.java.api.Context) 1
Dimension (io.tiledb.java.api.Dimension) 1
FilterList (io.tiledb.java.api.FilterList) 1
Layout (io.tiledb.java.api.Layout) 1
Pair (io.tiledb.java.api.Pair) 1
TileDBError (io.tiledb.java.api.TileDBError) 1
ArrayList (java.util.ArrayList) 1
TaskContext (org.apache.spark.TaskContext) 1
InputPartition (org.apache.spark.sql.connector.read.InputPartition) 1
Filter (org.apache.spark.sql.sources.Filter) 1
StructField (org.apache.spark.sql.types.StructField) 1