
Example 1 with HandlingInputRowIterator

Use of org.apache.druid.data.input.HandlingInputRowIterator in project druid by druid-io.

From the class PartialDimensionDistributionTask, the method runTask:

@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception {
    DataSchema dataSchema = ingestionSchema.getDataSchema();
    GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
    ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
    DimensionRangePartitionsSpec partitionsSpec = (DimensionRangePartitionsSpec) tuningConfig.getPartitionsSpec();
    Preconditions.checkNotNull(partitionsSpec, "partitionsSpec required in tuningConfig");
    final List<String> partitionDimensions = partitionsSpec.getPartitionDimensions();
    Preconditions.checkArgument(partitionDimensions != null && !partitionDimensions.isEmpty(), "partitionDimension required in partitionsSpec");
    boolean isAssumeGrouped = partitionsSpec.isAssumeGrouped();
    InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
    InputFormat inputFormat = inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null;
    final RowIngestionMeters buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
    final ParseExceptionHandler parseExceptionHandler = new ParseExceptionHandler(
        buildSegmentsMeters,
        tuningConfig.isLogParseExceptions(),
        tuningConfig.getMaxParseExceptions(),
        tuningConfig.getMaxSavedParseExceptions()
    );
    // When no input intervals are configured, read every non-null row so the intervals
    // can be determined from the data; otherwise filter rows to the configured intervals.
    final boolean determineIntervals = granularitySpec.inputIntervals().isEmpty();
    try (
        final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
            toolbox.getIndexingTmpDir(),
            dataSchema,
            inputSource,
            inputFormat,
            determineIntervals ? Objects::nonNull : AbstractBatchIndexTask.defaultRowFilter(granularitySpec),
            buildSegmentsMeters,
            parseExceptionHandler
        );
        HandlingInputRowIterator iterator = new RangePartitionIndexTaskInputRowIteratorBuilder(partitionDimensions, SKIP_NULL)
            .delegate(inputRowIterator)
            .granularitySpec(granularitySpec)
            .build()
    ) {
        // Compute the per-interval distribution of partition dimension values and report it to the supervisor.
        Map<Interval, StringDistribution> distribution = determineDistribution(iterator, granularitySpec, partitionDimensions, isAssumeGrouped);
        sendReport(toolbox, new DimensionDistributionReport(getId(), distribution));
    }
    return TaskStatus.success(getId());
}
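As Example 2 below shows, next() on a HandlingInputRowIterator may return null when one of the iterator's registered handlers has already consumed the row, so consumers must skip nulls. A minimal consumption sketch (not from the Druid source; processRow is a hypothetical java.util.function.Consumer<InputRow> callback standing in for real per-row work):

static void consume(HandlingInputRowIterator iterator, Consumer<InputRow> processRow) {
    while (iterator.hasNext()) {
        InputRow row = iterator.next();
        // A null row means a handler already dealt with it, e.g. the SKIP_NULL
        // strategy above dropping a row with a null partition dimension.
        if (row == null) {
            continue;
        }
        processRow.accept(row);
    }
}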
Also used: InputSource(org.apache.druid.data.input.InputSource) StringDistribution(org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution) DimensionRangePartitionsSpec(org.apache.druid.indexer.partitions.DimensionRangePartitionsSpec) HandlingInputRowIterator(org.apache.druid.data.input.HandlingInputRowIterator) DataSchema(org.apache.druid.segment.indexing.DataSchema) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) InputFormat(org.apache.druid.data.input.InputFormat) ParseExceptionHandler(org.apache.druid.segment.incremental.ParseExceptionHandler) InputRow(org.apache.druid.data.input.InputRow) RangePartitionIndexTaskInputRowIteratorBuilder(org.apache.druid.indexing.common.task.batch.parallel.iterator.RangePartitionIndexTaskInputRowIteratorBuilder) RowIngestionMeters(org.apache.druid.segment.incremental.RowIngestionMeters) Interval(org.joda.time.Interval)

Example 2 with HandlingInputRowIterator

Use of org.apache.druid.data.input.HandlingInputRowIterator in project druid by druid-io.

From the class InputSourceProcessor, the method process:

/**
 * This method opens the given {@link InputSource} and processes data via {@link InputSourceReader}.
 * All read data is consumed by a {@link BatchAppenderatorDriver}, which creates new segments.
 * All created segments are pushed when all input data has been processed successfully.
 *
 * @return {@link SegmentsAndCommitMetadata} for the pushed segments.
 */
public static SegmentsAndCommitMetadata process(
    DataSchema dataSchema,
    BatchAppenderatorDriver driver,
    PartitionsSpec partitionsSpec,
    InputSource inputSource,
    @Nullable InputFormat inputFormat,
    File tmpDir,
    SequenceNameFunction sequenceNameFunction,
    IndexTaskInputRowIteratorBuilder inputRowIteratorBuilder,
    RowIngestionMeters buildSegmentsMeters,
    ParseExceptionHandler parseExceptionHandler,
    long pushTimeout
) throws IOException, InterruptedException, ExecutionException, TimeoutException {
    @Nullable
    final DynamicPartitionsSpec dynamicPartitionsSpec =
        partitionsSpec instanceof DynamicPartitionsSpec ? (DynamicPartitionsSpec) partitionsSpec : null;
    final GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
    try (
        final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
            tmpDir,
            dataSchema,
            inputSource,
            inputFormat,
            AbstractBatchIndexTask.defaultRowFilter(granularitySpec),
            buildSegmentsMeters,
            parseExceptionHandler
        );
        final HandlingInputRowIterator iterator = inputRowIteratorBuilder
            .delegate(inputRowIterator)
            .granularitySpec(granularitySpec)
            .build()
    ) {
        while (iterator.hasNext()) {
            final InputRow inputRow = iterator.next();
            // A null row means one of the iterator's handlers already consumed it.
            if (inputRow == null) {
                continue;
            }
            // IndexTaskInputRowIteratorBuilder.absentBucketIntervalConsumer() ensures the interval is present here.
            Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
            @SuppressWarnings("OptionalGetWithoutIsPresent")
            final Interval interval = optInterval.get();
            final String sequenceName = sequenceNameFunction.getSequenceName(interval, inputRow);
            final AppenderatorDriverAddResult addResult = driver.add(inputRow, sequenceName);
            if (addResult.isOk()) {
                // Incremental segment publishing is allowed only when rollup doesn't have to be perfect.
                if (dynamicPartitionsSpec != null) {
                    final boolean isPushRequired = addResult.isPushRequired(
                        dynamicPartitionsSpec.getMaxRowsPerSegment(),
                        dynamicPartitionsSpec.getMaxTotalRowsOr(DynamicPartitionsSpec.DEFAULT_MAX_TOTAL_ROWS)
                    );
                    if (isPushRequired) {
                        // Some segments may be waiting to be pushed even though no more rows will be
                        // added to them. If those segments are not pushed here, the remaining available
                        // space in the appenderator stays small, which could lead to smaller segments.
                        final SegmentsAndCommitMetadata pushed = driver.pushAllAndClear(pushTimeout);
                        LOG.debugSegments(pushed.getSegments(), "Pushed segments");
                    }
                }
            } else {
                throw new ISE("Failed to add a row with timestamp[%s]", inputRow.getTimestamp());
            }
        }
        final SegmentsAndCommitMetadata pushed = driver.pushAllAndClear(pushTimeout);
        LOG.debugSegments(pushed.getSegments(), "Pushed segments");
        return pushed;
    }
}
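The incremental push above hinges on addResult.isPushRequired(...). A sketch of the rule it encodes, with numRowsInSegment and totalNumRowsInAppenderator as hypothetical counters standing in for the state AppenderatorDriverAddResult tracks (an illustration of the decision, not the Druid implementation):

static boolean isPushRequired(long numRowsInSegment, long totalNumRowsInAppenderator,
                              @Nullable Integer maxRowsPerSegment, @Nullable Long maxTotalRows) {
    // Push when the open segment has reached its per-segment row limit...
    boolean overMaxRowsInSegment = maxRowsPerSegment != null && numRowsInSegment >= maxRowsPerSegment;
    // ...or when the appenderator as a whole holds too many rows across all open segments.
    boolean overMaxTotalRows = maxTotalRows != null && totalNumRowsInAppenderator >= maxTotalRows;
    return overMaxRowsInSegment || overMaxTotalRows;
}

Either trigger flushes all pending segments via driver.pushAllAndClear(pushTimeout), freeing appenderator space so later segments are not forced to stay small.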
Also used: SegmentsAndCommitMetadata(org.apache.druid.segment.realtime.appenderator.SegmentsAndCommitMetadata) HandlingInputRowIterator(org.apache.druid.data.input.HandlingInputRowIterator) AppenderatorDriverAddResult(org.apache.druid.segment.realtime.appenderator.AppenderatorDriverAddResult) DynamicPartitionsSpec(org.apache.druid.indexer.partitions.DynamicPartitionsSpec) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) InputRow(org.apache.druid.data.input.InputRow) ISE(org.apache.druid.java.util.common.ISE) Nullable(javax.annotation.Nullable) Interval(org.joda.time.Interval)

Aggregations

HandlingInputRowIterator (org.apache.druid.data.input.HandlingInputRowIterator) 2
InputRow (org.apache.druid.data.input.InputRow) 2
GranularitySpec (org.apache.druid.segment.indexing.granularity.GranularitySpec) 2
Interval (org.joda.time.Interval) 2
Nullable (javax.annotation.Nullable) 1
InputFormat (org.apache.druid.data.input.InputFormat) 1
InputSource (org.apache.druid.data.input.InputSource) 1
DimensionRangePartitionsSpec (org.apache.druid.indexer.partitions.DimensionRangePartitionsSpec) 1
DynamicPartitionsSpec (org.apache.druid.indexer.partitions.DynamicPartitionsSpec) 1
StringDistribution (org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution) 1
RangePartitionIndexTaskInputRowIteratorBuilder (org.apache.druid.indexing.common.task.batch.parallel.iterator.RangePartitionIndexTaskInputRowIteratorBuilder) 1
ISE (org.apache.druid.java.util.common.ISE) 1
ParseExceptionHandler (org.apache.druid.segment.incremental.ParseExceptionHandler) 1
RowIngestionMeters (org.apache.druid.segment.incremental.RowIngestionMeters) 1
DataSchema (org.apache.druid.segment.indexing.DataSchema) 1
AppenderatorDriverAddResult (org.apache.druid.segment.realtime.appenderator.AppenderatorDriverAddResult) 1
SegmentsAndCommitMetadata (org.apache.druid.segment.realtime.appenderator.SegmentsAndCommitMetadata) 1