Use of org.apache.druid.data.input.HandlingInputRowIterator in project druid by druid-io.
From class PartialDimensionDistributionTask, method runTask:
@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception
{
  DataSchema dataSchema = ingestionSchema.getDataSchema();
  GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
  ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
  DimensionRangePartitionsSpec partitionsSpec = (DimensionRangePartitionsSpec) tuningConfig.getPartitionsSpec();
  Preconditions.checkNotNull(partitionsSpec, "partitionsSpec required in tuningConfig");
  final List<String> partitionDimensions = partitionsSpec.getPartitionDimensions();
  Preconditions.checkArgument(
      partitionDimensions != null && !partitionDimensions.isEmpty(),
      "partitionDimension required in partitionsSpec"
  );
  boolean isAssumeGrouped = partitionsSpec.isAssumeGrouped();
  InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
  InputFormat inputFormat = inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null;
  final RowIngestionMeters buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
  final ParseExceptionHandler parseExceptionHandler = new ParseExceptionHandler(
      buildSegmentsMeters,
      tuningConfig.isLogParseExceptions(),
      tuningConfig.getMaxParseExceptions(),
      tuningConfig.getMaxSavedParseExceptions()
  );
  final boolean determineIntervals = granularitySpec.inputIntervals().isEmpty();
  try (
      final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
          toolbox.getIndexingTmpDir(),
          dataSchema,
          inputSource,
          inputFormat,
          determineIntervals ? Objects::nonNull : AbstractBatchIndexTask.defaultRowFilter(granularitySpec),
          buildSegmentsMeters,
          parseExceptionHandler
      );
      HandlingInputRowIterator iterator = new RangePartitionIndexTaskInputRowIteratorBuilder(partitionDimensions, SKIP_NULL)
          .delegate(inputRowIterator)
          .granularitySpec(granularitySpec)
          .build()
  ) {
    Map<Interval, StringDistribution> distribution = determineDistribution(iterator, granularitySpec, partitionDimensions, isAssumeGrouped);
    sendReport(toolbox, new DimensionDistributionReport(getId(), distribution));
  }
  return TaskStatus.success(getId());
}
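How an iterator built this way is consumed can be seen in the second example below. As a quick illustration of the same pattern, here is a minimal, hypothetical helper (not part of Druid, and not the body of determineDistribution) that tallies non-null rows per bucket interval; it relies only on the iterator contract shown in these examples, where next() may return null once one of the iterator's handlers has dealt with a row:

// Hypothetical helper for illustration only; countRowsPerBucket is not a Druid method.
// Requires java.util.Map and java.util.HashMap in addition to the Druid classes above.
static Map<Interval, Long> countRowsPerBucket(HandlingInputRowIterator iterator, GranularitySpec granularitySpec)
{
  final Map<Interval, Long> rowsPerBucket = new HashMap<>();
  while (iterator.hasNext()) {
    final InputRow row = iterator.next();
    if (row == null) {
      continue; // the row was already handled by the iterator itself
    }
    // For rows that reach this point, the builder's handling guarantees a bucket interval exists
    // (see the absentBucketIntervalConsumer() comment in the next example).
    final Interval bucket = granularitySpec.bucketInterval(row.getTimestamp()).get();
    rowsPerBucket.merge(bucket, 1L, Long::sum);
  }
  return rowsPerBucket;
}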
Use of org.apache.druid.data.input.HandlingInputRowIterator in project druid by druid-io.
From class InputSourceProcessor, method process:
/**
* This method opens the given {@link InputSource} and processes data via {@link InputSourceReader}.
* All read data is consumed by {@link BatchAppenderatorDriver}, which creates new segments.
* All created segments are pushed when all input data is processed successfully.
*
* @return {@link SegmentsAndCommitMetadata} for the pushed segments.
*/
public static SegmentsAndCommitMetadata process(
    DataSchema dataSchema,
    BatchAppenderatorDriver driver,
    PartitionsSpec partitionsSpec,
    InputSource inputSource,
    @Nullable InputFormat inputFormat,
    File tmpDir,
    SequenceNameFunction sequenceNameFunction,
    IndexTaskInputRowIteratorBuilder inputRowIteratorBuilder,
    RowIngestionMeters buildSegmentsMeters,
    ParseExceptionHandler parseExceptionHandler,
    long pushTimeout
) throws IOException, InterruptedException, ExecutionException, TimeoutException
{
  @Nullable final DynamicPartitionsSpec dynamicPartitionsSpec =
      partitionsSpec instanceof DynamicPartitionsSpec ? (DynamicPartitionsSpec) partitionsSpec : null;
  final GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
  try (
      final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
          tmpDir,
          dataSchema,
          inputSource,
          inputFormat,
          AbstractBatchIndexTask.defaultRowFilter(granularitySpec),
          buildSegmentsMeters,
          parseExceptionHandler
      );
      final HandlingInputRowIterator iterator = inputRowIteratorBuilder
          .delegate(inputRowIterator)
          .granularitySpec(granularitySpec)
          .build()
  ) {
    while (iterator.hasNext()) {
      final InputRow inputRow = iterator.next();
      if (inputRow == null) {
        continue;
      }
      // IndexTaskInputRowIteratorBuilder.absentBucketIntervalConsumer() ensures the interval will be present here
      Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
      @SuppressWarnings("OptionalGetWithoutIsPresent") final Interval interval = optInterval.get();
      final String sequenceName = sequenceNameFunction.getSequenceName(interval, inputRow);
      final AppenderatorDriverAddResult addResult = driver.add(inputRow, sequenceName);
      if (addResult.isOk()) {
        // Incremental segment publishing is allowed only when rollup doesn't have to be perfect.
        if (dynamicPartitionsSpec != null) {
          final boolean isPushRequired = addResult.isPushRequired(
              dynamicPartitionsSpec.getMaxRowsPerSegment(),
              dynamicPartitionsSpec.getMaxTotalRowsOr(DynamicPartitionsSpec.DEFAULT_MAX_TOTAL_ROWS)
          );
          if (isPushRequired) {
            // There can be segments waiting to be pushed even though no more rows will ever be added to them.
            // If those segments are not pushed here, the remaining space in the appenderator stays small,
            // which could lead to smaller segments.
            final SegmentsAndCommitMetadata pushed = driver.pushAllAndClear(pushTimeout);
            LOG.debugSegments(pushed.getSegments(), "Pushed segments");
          }
        }
      } else {
        throw new ISE("Failed to add a row with timestamp[%s]", inputRow.getTimestamp());
      }
    }
    final SegmentsAndCommitMetadata pushed = driver.pushAllAndClear(pushTimeout);
    LOG.debugSegments(pushed.getSegments(), "Pushed segments");
    return pushed;
  }
}
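For orientation, here is a hedged sketch of what a call site for this method might look like. It is not taken from the Druid source: the dataSchema, driver, tuningConfig, inputSource, inputFormat, sequenceNameFunction, meters, parse-exception handler, and pushTimeout are all assumed to have been created by the surrounding indexing task, and DefaultIndexTaskInputRowIteratorBuilder is assumed here as the non-range iterator builder implementation.

// Hypothetical call site (not from the Druid source); argument order follows the signature above.
final SegmentsAndCommitMetadata pushed = InputSourceProcessor.process(
    dataSchema,
    driver,                                        // BatchAppenderatorDriver owned by the task
    tuningConfig.getPartitionsSpec(),
    inputSource,
    inputFormat,
    toolbox.getIndexingTmpDir(),
    sequenceNameFunction,
    new DefaultIndexTaskInputRowIteratorBuilder(), // assumed builder implementation
    buildSegmentsMeters,
    parseExceptionHandler,
    pushTimeout
);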