Use of org.apache.druid.data.input.InputSource in project druid by druid-io.
The class SinglePhaseParallelIndexTaskRunner, method newTaskSpec:
@VisibleForTesting
SubTaskSpec<SinglePhaseSubTask> newTaskSpec(InputSplit split) {
  final FirehoseFactory firehoseFactory;
  final InputSource inputSource;
  if (baseInputSource instanceof FirehoseFactoryToInputSourceAdaptor) {
    firehoseFactory = ((FirehoseFactoryToInputSourceAdaptor) baseInputSource).getFirehoseFactory().withSplit(split);
    inputSource = null;
  } else {
    firehoseFactory = null;
    inputSource = baseInputSource.withSplit(split);
  }
  final Map<String, Object> subtaskContext = new HashMap<>(getContext());
  return new SinglePhaseSubTaskSpec(
      getBaseSubtaskSpecName() + "_" + getAndIncrementNextSpecId(),
      getGroupId(),
      getTaskId(),
      new ParallelIndexIngestionSpec(
          ingestionSchema.getDataSchema(),
          new ParallelIndexIOConfig(
              firehoseFactory,
              inputSource,
              ingestionSchema.getIOConfig().getInputFormat(),
              ingestionSchema.getIOConfig().isAppendToExisting(),
              ingestionSchema.getIOConfig().isDropExisting()),
          ingestionSchema.getTuningConfig()),
      subtaskContext,
      split);
}
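The method maps one InputSplit to one sub-task spec: exactly one of firehoseFactory or inputSource is non-null (the legacy firehose path versus the native InputSource path), and each spec gets a unique name built from the base name plus an incrementing counter. Below is a minimal, self-contained sketch of that naming and one-spec-per-split loop; SpecNamingSketch and newSpec are hypothetical stand-ins, not Druid classes.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

// Hypothetical stand-in to illustrate the one-spec-per-split pattern above.
class SpecNamingSketch {
  private final AtomicInteger nextSpecId = new AtomicInteger(0);
  private final String baseName;

  SpecNamingSketch(String baseName) {
    this.baseName = baseName;
  }

  // Mirrors newTaskSpec: each split yields one spec with a unique "<base>_<n>" name.
  String newSpec(String split) {
    return baseName + "_" + nextSpecId.getAndIncrement() + " -> " + split;
  }

  public static void main(String[] args) {
    SpecNamingSketch runner = new SpecNamingSketch("single_phase_sub_task");
    List<String> specs = new ArrayList<>();
    for (String split : List.of("file-a.json", "file-b.json", "file-c.json")) {
      specs.add(runner.newSpec(split));
    }
    specs.forEach(System.out::println);
  }
}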
Use of org.apache.druid.data.input.InputSource in project druid by druid-io.
The class PartialDimensionCardinalityTask, method runTask:
@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception {
  DataSchema dataSchema = ingestionSchema.getDataSchema();
  GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
  ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
  HashedPartitionsSpec partitionsSpec = (HashedPartitionsSpec) tuningConfig.getPartitionsSpec();
  Preconditions.checkNotNull(partitionsSpec, "partitionsSpec required in tuningConfig");
  InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
  InputFormat inputFormat = inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null;
  final RowIngestionMeters buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
  final ParseExceptionHandler parseExceptionHandler = new ParseExceptionHandler(
      buildSegmentsMeters,
      tuningConfig.isLogParseExceptions(),
      tuningConfig.getMaxParseExceptions(),
      tuningConfig.getMaxSavedParseExceptions());
  final boolean determineIntervals = granularitySpec.inputIntervals().isEmpty();
  try (final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
      toolbox.getIndexingTmpDir(),
      dataSchema,
      inputSource,
      inputFormat,
      determineIntervals ? Objects::nonNull : AbstractBatchIndexTask.defaultRowFilter(granularitySpec),
      buildSegmentsMeters,
      parseExceptionHandler)) {
    Map<Interval, byte[]> cardinalities = determineCardinalities(inputRowIterator, granularitySpec);
    sendReport(toolbox, new DimensionCardinalityReport(getId(), cardinalities));
  }
  return TaskStatus.success(getId());
}
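The task streams rows from the InputSource, buckets them by interval, and reports a per-interval cardinality estimate back to the supervisor (the byte[] values in the report are serialized cardinality sketches). A simplified, self-contained sketch of the same bucketed-distinct-count idea, using exact HashSets keyed by a plain String bucket instead of sketches and Intervals:

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

// Illustrative only: not the Druid implementation, which uses sketches per Interval.
class CardinalitySketch {
  static Map<String, Integer> determineCardinalities(Iterator<String[]> rows) {
    Map<String, Set<String>> perBucket = new HashMap<>();
    while (rows.hasNext()) {
      String[] row = rows.next();          // row[0] = time bucket, row[1] = dimension value
      perBucket.computeIfAbsent(row[0], b -> new HashSet<>()).add(row[1]);
    }
    Map<String, Integer> cardinalities = new HashMap<>();
    perBucket.forEach((bucket, values) -> cardinalities.put(bucket, values.size()));
    return cardinalities;
  }

  public static void main(String[] args) {
    Iterator<String[]> rows = List.of(
        new String[]{"2024-01-01", "a"},
        new String[]{"2024-01-01", "b"},
        new String[]{"2024-01-02", "a"}
    ).iterator();
    System.out.println(determineCardinalities(rows)); // {2024-01-01=2, 2024-01-02=1}
  }
}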
Use of org.apache.druid.data.input.InputSource in project druid by druid-io.
The class PartialDimensionDistributionTask, method runTask:
@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception {
  DataSchema dataSchema = ingestionSchema.getDataSchema();
  GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
  ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
  DimensionRangePartitionsSpec partitionsSpec = (DimensionRangePartitionsSpec) tuningConfig.getPartitionsSpec();
  Preconditions.checkNotNull(partitionsSpec, "partitionsSpec required in tuningConfig");
  final List<String> partitionDimensions = partitionsSpec.getPartitionDimensions();
  Preconditions.checkArgument(
      partitionDimensions != null && !partitionDimensions.isEmpty(),
      "partitionDimension required in partitionsSpec");
  boolean isAssumeGrouped = partitionsSpec.isAssumeGrouped();
  InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
  InputFormat inputFormat = inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null;
  final RowIngestionMeters buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
  final ParseExceptionHandler parseExceptionHandler = new ParseExceptionHandler(
      buildSegmentsMeters,
      tuningConfig.isLogParseExceptions(),
      tuningConfig.getMaxParseExceptions(),
      tuningConfig.getMaxSavedParseExceptions());
  final boolean determineIntervals = granularitySpec.inputIntervals().isEmpty();
  try (final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
          toolbox.getIndexingTmpDir(),
          dataSchema,
          inputSource,
          inputFormat,
          determineIntervals ? Objects::nonNull : AbstractBatchIndexTask.defaultRowFilter(granularitySpec),
          buildSegmentsMeters,
          parseExceptionHandler);
       HandlingInputRowIterator iterator = new RangePartitionIndexTaskInputRowIteratorBuilder(partitionDimensions, SKIP_NULL)
           .delegate(inputRowIterator)
           .granularitySpec(granularitySpec)
           .build()) {
    Map<Interval, StringDistribution> distribution = determineDistribution(iterator, granularitySpec, partitionDimensions, isAssumeGrouped);
    sendReport(toolbox, new DimensionDistributionReport(getId(), distribution));
  }
  return TaskStatus.success(getId());
}
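Note that the try-with-resources block declares two resources, where the second (the HandlingInputRowIterator) wraps the first; Java closes resources in reverse declaration order, so the wrapper is released before the underlying CloseableIterator. A minimal sketch of that pattern with hypothetical Source and Wrapper classes (not Druid types):

// Hypothetical classes demonstrating two chained resources in one try-with-resources.
class Source implements AutoCloseable {
  @Override public void close() { System.out.println("source closed"); }
}

class Wrapper implements AutoCloseable {
  private final Source delegate;
  Wrapper(Source delegate) { this.delegate = delegate; }
  @Override public void close() { System.out.println("wrapper closed"); }
}

class TryWithResourcesSketch {
  public static void main(String[] args) {
    try (Source source = new Source();
         Wrapper wrapper = new Wrapper(source)) {
      System.out.println("reading rows...");
    }
    // Output order: reading rows... / wrapper closed / source closed
  }
}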
Use of org.apache.druid.data.input.InputSource in project druid by druid-io.
The class PartialSegmentGenerateTask, method runTask:
@Override
public final TaskStatus runTask(TaskToolbox toolbox) throws Exception {
  final InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
  final ParallelIndexSupervisorTaskClient taskClient = toolbox.getSupervisorTaskClientFactory().build(
      new ClientBasedTaskInfoProvider(toolbox.getIndexingServiceClient()),
      getId(),
      1, // always use a single http thread
      ingestionSchema.getTuningConfig().getChatHandlerTimeout(),
      ingestionSchema.getTuningConfig().getChatHandlerNumRetries());
  final List<DataSegment> segments = generateSegments(toolbox, taskClient, inputSource, toolbox.getIndexingTmpDir());
  taskClient.report(supervisorTaskId, createGeneratedPartitionsReport(toolbox, segments));
  return TaskStatus.success(getId());
}
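The supervisor client is built with a timeout and a retry count taken from the tuning config, so calls back to the supervisor task are conceptually retried up to that many times before failing. A small, self-contained sketch of such bounded retry behaviour; withRetries is a hypothetical helper, not the Druid client API:

import java.util.function.Supplier;

// Hypothetical bounded-retry helper for illustration only.
class RetrySketch {
  static <T> T withRetries(Supplier<T> call, int maxRetries) {
    RuntimeException last = null;
    for (int attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        return call.get();
      } catch (RuntimeException e) {
        last = e; // in real code: back off before the next attempt
      }
    }
    throw last;
  }

  public static void main(String[] args) {
    int result = withRetries(() -> 42, 3);
    System.out.println(result); // 42
  }
}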
Use of org.apache.druid.data.input.InputSource in project druid by druid-io.
The class SinglePhaseSubTask, method runTask:
@Override
public TaskStatus runTask(final TaskToolbox toolbox) {
  try {
    if (missingIntervalsInOverwriteMode) {
      LOG.warn(
          "Intervals are missing in granularitySpec while this task is potentially overwriting existing segments. "
          + "Forced to use timeChunk lock.");
    }
    this.authorizerMapper = toolbox.getAuthorizerMapper();
    toolbox.getChatHandlerProvider().register(getId(), this, false);
    rowIngestionMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
    parseExceptionHandler = new ParseExceptionHandler(
        rowIngestionMeters,
        ingestionSchema.getTuningConfig().isLogParseExceptions(),
        ingestionSchema.getTuningConfig().getMaxParseExceptions(),
        ingestionSchema.getTuningConfig().getMaxSavedParseExceptions());
    final InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
    final ParallelIndexSupervisorTaskClient taskClient = toolbox.getSupervisorTaskClientFactory().build(
        new ClientBasedTaskInfoProvider(toolbox.getIndexingServiceClient()),
        getId(),
        1, // always use a single http thread
        ingestionSchema.getTuningConfig().getChatHandlerTimeout(),
        ingestionSchema.getTuningConfig().getChatHandlerNumRetries());
    ingestionState = IngestionState.BUILD_SEGMENTS;
    final Set<DataSegment> pushedSegments = generateAndPushSegments(toolbox, taskClient, inputSource, toolbox.getIndexingTmpDir());
    // Find inputSegments overshadowed by pushedSegments
    final Set<DataSegment> allSegments = new HashSet<>(getTaskLockHelper().getLockedExistingSegments());
    allSegments.addAll(pushedSegments);
    final VersionedIntervalTimeline<String, DataSegment> timeline = VersionedIntervalTimeline.forSegments(allSegments);
    final Set<DataSegment> oldSegments = FluentIterable.from(timeline.findFullyOvershadowed())
        .transformAndConcat(TimelineObjectHolder::getObject)
        .transform(PartitionChunk::getObject)
        .toSet();
    Map<String, TaskReport> taskReport = getTaskCompletionReports();
    taskClient.report(supervisorTaskId, new PushedSegmentsReport(getId(), oldSegments, pushedSegments, taskReport));
    toolbox.getTaskReportFileWriter().write(getId(), taskReport);
    return TaskStatus.success(getId());
  } catch (Exception e) {
    LOG.error(e, "Encountered exception in parallel sub task.");
    errorMsg = Throwables.getStackTraceAsString(e);
    toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports());
    return TaskStatus.failure(getId(), errorMsg);
  } finally {
    toolbox.getChatHandlerProvider().unregister(getId());
  }
}
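After pushing its segments, the sub-task rebuilds a versioned timeline over the locked existing segments plus the newly pushed ones and reports any segment that is now fully overshadowed (covered by a newer version over its whole interval) as an old segment. A simplified sketch of that check with a hypothetical Segment record; Druid's real timeline also handles partition chunks and coverage by multiple segments:

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

// Hypothetical Segment: an interval [start, end) plus a version string.
record Segment(long start, long end, String version) {}

class OvershadowSketch {
  // A segment counts as "fully overshadowed" here if a single newer-version segment covers its interval.
  static Set<Segment> findFullyOvershadowed(List<Segment> all) {
    return all.stream()
        .filter(s -> all.stream().anyMatch(
            other -> other.version().compareTo(s.version()) > 0
                && other.start() <= s.start()
                && other.end() >= s.end()))
        .collect(Collectors.toSet());
  }

  public static void main(String[] args) {
    List<Segment> all = List.of(
        new Segment(0, 10, "v1"),   // old segment, fully covered by the v2 segment below
        new Segment(0, 10, "v2"),   // newly pushed segment
        new Segment(10, 20, "v1")   // not overshadowed: no newer version covers it
    );
    System.out.println(findFullyOvershadowed(all)); // [Segment[start=0, end=10, version=v1]]
  }
}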