Example 16 with InputSource

Use of org.apache.druid.data.input.InputSource in project druid by druid-io.

The class SinglePhaseParallelIndexTaskRunner, method newTaskSpec:

@VisibleForTesting
SubTaskSpec<SinglePhaseSubTask> newTaskSpec(InputSplit split) {
    final FirehoseFactory firehoseFactory;
    final InputSource inputSource;
    if (baseInputSource instanceof FirehoseFactoryToInputSourceAdaptor) {
        // Legacy spec: unwrap the adaptor and push the split into the firehose factory.
        firehoseFactory = ((FirehoseFactoryToInputSourceAdaptor) baseInputSource).getFirehoseFactory().withSplit(split);
        inputSource = null;
    } else {
        // Native spec: scope the input source to exactly this split.
        firehoseFactory = null;
        inputSource = baseInputSource.withSplit(split);
    }
    final Map<String, Object> subtaskContext = new HashMap<>(getContext());
    return new SinglePhaseSubTaskSpec(
        getBaseSubtaskSpecName() + "_" + getAndIncrementNextSpecId(),
        getGroupId(),
        getTaskId(),
        new ParallelIndexIngestionSpec(
            ingestionSchema.getDataSchema(),
            new ParallelIndexIOConfig(
                firehoseFactory,
                inputSource,
                ingestionSchema.getIOConfig().getInputFormat(),
                ingestionSchema.getIOConfig().isAppendToExisting(),
                ingestionSchema.getIOConfig().isDropExisting()
            ),
            ingestionSchema.getTuningConfig()
        ),
        subtaskContext,
        split
    );
}
Also used: InputSource(org.apache.druid.data.input.InputSource), SplittableInputSource(org.apache.druid.data.input.impl.SplittableInputSource), FirehoseFactory(org.apache.druid.data.input.FirehoseFactory), HashMap(java.util.HashMap), ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap), MutableObject(org.apache.commons.lang3.mutable.MutableObject), FirehoseFactoryToInputSourceAdaptor(org.apache.druid.data.input.FirehoseFactoryToInputSourceAdaptor), VisibleForTesting(com.google.common.annotations.VisibleForTesting)
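
For context, the runner calls newTaskSpec once per input split. The sketch below shows that fan-out in isolation. It assumes LocalInputSource's two-argument constructor and that null is acceptable for the format and split-hint arguments of createSplits (the real runner passes its configured values); the directory and filter are purely illustrative.

import java.io.File;
import org.apache.druid.data.input.InputSource;
import org.apache.druid.data.input.impl.LocalInputSource;

public class SplitFanOutSketch
{
    public static void main(String[] args) throws Exception
    {
        // Hypothetical splittable source; directory and filter are illustrative.
        final LocalInputSource baseInputSource = new LocalInputSource(new File("/tmp/data"), "*.json");

        // createSplits enumerates the splits; the runner calls withSplit once per
        // split so each sub-task reads exactly its own slice of the input.
        baseInputSource.createSplits(null, null).forEach(split -> {
            final InputSource perSubTaskSource = baseInputSource.withSplit(split);
            System.out.println("sub-task source created for split: " + split.get());
        });
    }
}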

Example 17 with InputSource

Use of org.apache.druid.data.input.InputSource in project druid by druid-io.

The class PartialDimensionCardinalityTask, method runTask:

@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception {
    DataSchema dataSchema = ingestionSchema.getDataSchema();
    GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
    ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
    HashedPartitionsSpec partitionsSpec = (HashedPartitionsSpec) tuningConfig.getPartitionsSpec();
    Preconditions.checkNotNull(partitionsSpec, "partitionsSpec required in tuningConfig");
    InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
    InputFormat inputFormat = inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null;
    final RowIngestionMeters buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
    final ParseExceptionHandler parseExceptionHandler = new ParseExceptionHandler(
        buildSegmentsMeters,
        tuningConfig.isLogParseExceptions(),
        tuningConfig.getMaxParseExceptions(),
        tuningConfig.getMaxSavedParseExceptions()
    );
    final boolean determineIntervals = granularitySpec.inputIntervals().isEmpty();
    try (final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
        toolbox.getIndexingTmpDir(),
        dataSchema,
        inputSource,
        inputFormat,
        determineIntervals ? Objects::nonNull : AbstractBatchIndexTask.defaultRowFilter(granularitySpec),
        buildSegmentsMeters,
        parseExceptionHandler
    )) {
        Map<Interval, byte[]> cardinalities = determineCardinalities(inputRowIterator, granularitySpec);
        sendReport(toolbox, new DimensionCardinalityReport(getId(), cardinalities));
    }
    return TaskStatus.success(getId());
}
Also used: HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec), InputSource(org.apache.druid.data.input.InputSource), DataSchema(org.apache.druid.segment.indexing.DataSchema), GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec), InputFormat(org.apache.druid.data.input.InputFormat), ParseExceptionHandler(org.apache.druid.segment.incremental.ParseExceptionHandler), InputRow(org.apache.druid.data.input.InputRow), RowIngestionMeters(org.apache.druid.segment.incremental.RowIngestionMeters), Interval(org.joda.time.Interval)
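
The heavy lifting happens in determineCardinalities, which buckets each row by its segment interval and estimates the distinct-key count per bucket. Below is a simplified stand-in for that loop: it uses exact HashSets where the real task uses HLL sketches, and the method name countDistinctKeys is hypothetical; the bucketing call and null-row handling mirror the task code above.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.java.util.common.parsers.CloseableIterator;
import org.apache.druid.segment.indexing.granularity.GranularitySpec;
import org.joda.time.Interval;

final class CardinalitySketch
{
    // Simplified stand-in for determineCardinalities: exact sets instead of HLL sketches.
    static Map<Interval, Integer> countDistinctKeys(
        CloseableIterator<InputRow> rows,
        GranularitySpec granularitySpec
    )
    {
        final Map<Interval, Set<List<Object>>> seen = new HashMap<>();
        while (rows.hasNext()) {
            final InputRow row = rows.next();
            if (row == null) {
                continue; // rows rejected by the upstream row filter arrive as nulls
            }
            // Bucket the row into its segment interval.
            final Interval interval = granularitySpec.getSegmentGranularity().bucket(row.getTimestamp());
            // Key each row by timestamp plus its dimension values.
            final List<Object> key = new ArrayList<>();
            key.add(row.getTimestampFromEpoch());
            for (String dim : row.getDimensions()) {
                key.add(row.getRaw(dim));
            }
            seen.computeIfAbsent(interval, i -> new HashSet<>()).add(key);
        }
        final Map<Interval, Integer> cardinalities = new HashMap<>();
        seen.forEach((interval, keys) -> cardinalities.put(interval, keys.size()));
        return cardinalities;
    }
}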

Example 18 with InputSource

Use of org.apache.druid.data.input.InputSource in project druid by druid-io.

The class PartialDimensionDistributionTask, method runTask:

@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception {
    DataSchema dataSchema = ingestionSchema.getDataSchema();
    GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
    ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
    DimensionRangePartitionsSpec partitionsSpec = (DimensionRangePartitionsSpec) tuningConfig.getPartitionsSpec();
    Preconditions.checkNotNull(partitionsSpec, "partitionsSpec required in tuningConfig");
    final List<String> partitionDimensions = partitionsSpec.getPartitionDimensions();
    Preconditions.checkArgument(partitionDimensions != null && !partitionDimensions.isEmpty(), "partitionDimension required in partitionsSpec");
    boolean isAssumeGrouped = partitionsSpec.isAssumeGrouped();
    InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
    InputFormat inputFormat = inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null;
    final RowIngestionMeters buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
    final ParseExceptionHandler parseExceptionHandler = new ParseExceptionHandler(
        buildSegmentsMeters,
        tuningConfig.isLogParseExceptions(),
        tuningConfig.getMaxParseExceptions(),
        tuningConfig.getMaxSavedParseExceptions()
    );
    final boolean determineIntervals = granularitySpec.inputIntervals().isEmpty();
    try (final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
        toolbox.getIndexingTmpDir(),
        dataSchema,
        inputSource,
        inputFormat,
        determineIntervals ? Objects::nonNull : AbstractBatchIndexTask.defaultRowFilter(granularitySpec),
        buildSegmentsMeters,
        parseExceptionHandler
    );
        HandlingInputRowIterator iterator = new RangePartitionIndexTaskInputRowIteratorBuilder(partitionDimensions, SKIP_NULL)
            .delegate(inputRowIterator)
            .granularitySpec(granularitySpec)
            .build()) {
        Map<Interval, StringDistribution> distribution = determineDistribution(iterator, granularitySpec, partitionDimensions, isAssumeGrouped);
        sendReport(toolbox, new DimensionDistributionReport(getId(), distribution));
    }
    return TaskStatus.success(getId());
}
Also used: InputSource(org.apache.druid.data.input.InputSource), StringDistribution(org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution), DimensionRangePartitionsSpec(org.apache.druid.indexer.partitions.DimensionRangePartitionsSpec), HandlingInputRowIterator(org.apache.druid.data.input.HandlingInputRowIterator), DataSchema(org.apache.druid.segment.indexing.DataSchema), GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec), InputFormat(org.apache.druid.data.input.InputFormat), ParseExceptionHandler(org.apache.druid.segment.incremental.ParseExceptionHandler), InputRow(org.apache.druid.data.input.InputRow), RangePartitionIndexTaskInputRowIteratorBuilder(org.apache.druid.indexing.common.task.batch.parallel.iterator.RangePartitionIndexTaskInputRowIteratorBuilder), RowIngestionMeters(org.apache.druid.segment.incremental.RowIngestionMeters), Interval(org.joda.time.Interval)
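
determineDistribution follows the same bucketing pattern, but instead of counting distinct keys it records where the partition-dimension values fall, so the supervisor can later pick balanced range boundaries. The sketch below is a simplified stand-in: it tracks exact sorted value counts for a single partition dimension where the real task builds a StringDistribution (a StringSketch), and collectValueDistribution is a hypothetical name.

import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import org.apache.druid.data.input.HandlingInputRowIterator;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.segment.indexing.granularity.GranularitySpec;
import org.joda.time.Interval;

final class DistributionSketch
{
    // Simplified stand-in for determineDistribution: exact sorted counts
    // instead of a StringSketch, and only one partition dimension.
    static Map<Interval, TreeMap<String, Long>> collectValueDistribution(
        HandlingInputRowIterator iterator,
        GranularitySpec granularitySpec,
        String partitionDimension
    )
    {
        final Map<Interval, TreeMap<String, Long>> distributions = new HashMap<>();
        while (iterator.hasNext()) {
            final InputRow row = iterator.next();
            if (row == null) {
                continue; // rows consumed by the iterator's handlers (e.g. SKIP_NULL) arrive as nulls
            }
            final Interval interval = granularitySpec.getSegmentGranularity().bucket(row.getTimestamp());
            // TreeMap keeps values sorted, which is what range boundaries need.
            final String value = String.valueOf(row.getRaw(partitionDimension));
            distributions
                .computeIfAbsent(interval, i -> new TreeMap<>())
                .merge(value, 1L, Long::sum);
        }
        return distributions;
    }
}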

Example 19 with InputSource

Use of org.apache.druid.data.input.InputSource in project druid by druid-io.

The class PartialSegmentGenerateTask, method runTask:

@Override
public final TaskStatus runTask(TaskToolbox toolbox) throws Exception {
    final InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
    final ParallelIndexSupervisorTaskClient taskClient = toolbox.getSupervisorTaskClientFactory().build(
        new ClientBasedTaskInfoProvider(toolbox.getIndexingServiceClient()),
        getId(),
        1, // always use a single http thread
        ingestionSchema.getTuningConfig().getChatHandlerTimeout(),
        ingestionSchema.getTuningConfig().getChatHandlerNumRetries()
    );
    final List<DataSegment> segments = generateSegments(toolbox, taskClient, inputSource, toolbox.getIndexingTmpDir());
    taskClient.report(supervisorTaskId, createGeneratedPartitionsReport(toolbox, segments));
    return TaskStatus.success(getId());
}
Also used: InputSource(org.apache.druid.data.input.InputSource), ClientBasedTaskInfoProvider(org.apache.druid.indexing.common.task.ClientBasedTaskInfoProvider), DataSegment(org.apache.druid.timeline.DataSegment)
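
getNonNullInputSource hides the legacy-firehose fallback that Example 16 handles explicitly: a configured native inputSource is returned as-is, otherwise the firehose factory is wrapped in FirehoseFactoryToInputSourceAdaptor. A conceptual sketch of that resolution follows; resolveInputSource is a hypothetical helper, and the exact casts and null checks in Druid's IOConfig may differ from this illustration.

import javax.annotation.Nullable;
import com.google.common.base.Preconditions;
import org.apache.druid.data.input.FiniteFirehoseFactory;
import org.apache.druid.data.input.FirehoseFactoryToInputSourceAdaptor;
import org.apache.druid.data.input.InputSource;
import org.apache.druid.data.input.impl.InputRowParser;

final class InputSourceResolutionSketch
{
    // Hypothetical helper illustrating the fallback behind getNonNullInputSource.
    static InputSource resolveInputSource(
        @Nullable InputSource inputSource,
        @Nullable FiniteFirehoseFactory firehoseFactory,
        @Nullable InputRowParser parser
    )
    {
        if (inputSource != null) {
            return inputSource; // a native input source wins when configured
        }
        // Legacy specs configure a firehose; wrap it so downstream code only
        // ever deals with the InputSource API.
        return new FirehoseFactoryToInputSourceAdaptor(
            Preconditions.checkNotNull(firehoseFactory, "either inputSource or firehoseFactory must be set"),
            parser
        );
    }
}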

Example 20 with InputSource

Use of org.apache.druid.data.input.InputSource in project druid by druid-io.

The class SinglePhaseSubTask, method runTask:

@Override
public TaskStatus runTask(final TaskToolbox toolbox) {
    try {
        if (missingIntervalsInOverwriteMode) {
            LOG.warn("Intervals are missing in granularitySpec while this task is potentially overwriting existing segments. " + "Forced to use timeChunk lock.");
        }
        this.authorizerMapper = toolbox.getAuthorizerMapper();
        toolbox.getChatHandlerProvider().register(getId(), this, false);
        rowIngestionMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
        parseExceptionHandler = new ParseExceptionHandler(
            rowIngestionMeters,
            ingestionSchema.getTuningConfig().isLogParseExceptions(),
            ingestionSchema.getTuningConfig().getMaxParseExceptions(),
            ingestionSchema.getTuningConfig().getMaxSavedParseExceptions()
        );
        final InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
        final ParallelIndexSupervisorTaskClient taskClient = toolbox.getSupervisorTaskClientFactory().build(
            new ClientBasedTaskInfoProvider(toolbox.getIndexingServiceClient()),
            getId(),
            1, // always use a single http thread
            ingestionSchema.getTuningConfig().getChatHandlerTimeout(),
            ingestionSchema.getTuningConfig().getChatHandlerNumRetries()
        );
        ingestionState = IngestionState.BUILD_SEGMENTS;
        final Set<DataSegment> pushedSegments = generateAndPushSegments(toolbox, taskClient, inputSource, toolbox.getIndexingTmpDir());
        // Find inputSegments overshadowed by pushedSegments
        final Set<DataSegment> allSegments = new HashSet<>(getTaskLockHelper().getLockedExistingSegments());
        allSegments.addAll(pushedSegments);
        final VersionedIntervalTimeline<String, DataSegment> timeline = VersionedIntervalTimeline.forSegments(allSegments);
        final Set<DataSegment> oldSegments = FluentIterable
            .from(timeline.findFullyOvershadowed())
            .transformAndConcat(TimelineObjectHolder::getObject)
            .transform(PartitionChunk::getObject)
            .toSet();
        Map<String, TaskReport> taskReport = getTaskCompletionReports();
        taskClient.report(supervisorTaskId, new PushedSegmentsReport(getId(), oldSegments, pushedSegments, taskReport));
        toolbox.getTaskReportFileWriter().write(getId(), taskReport);
        return TaskStatus.success(getId());
    } catch (Exception e) {
        LOG.error(e, "Encountered exception in parallel sub task.");
        errorMsg = Throwables.getStackTraceAsString(e);
        toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports());
        return TaskStatus.failure(getId(), errorMsg);
    } finally {
        toolbox.getChatHandlerProvider().unregister(getId());
    }
}
Also used: InputSource(org.apache.druid.data.input.InputSource), TaskReport(org.apache.druid.indexing.common.TaskReport), IngestionStatsAndErrorsTaskReport(org.apache.druid.indexing.common.IngestionStatsAndErrorsTaskReport), DataSegment(org.apache.druid.timeline.DataSegment), TimeoutException(java.util.concurrent.TimeoutException), IOException(java.io.IOException), ExecutionException(java.util.concurrent.ExecutionException), TimelineObjectHolder(org.apache.druid.timeline.TimelineObjectHolder), ParseExceptionHandler(org.apache.druid.segment.incremental.ParseExceptionHandler), ClientBasedTaskInfoProvider(org.apache.druid.indexing.common.task.ClientBasedTaskInfoProvider), HashSet(java.util.HashSet)
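
The overshadow bookkeeping in the middle of runTask is worth isolating: the union of locked existing segments and freshly pushed segments is arranged on a versioned timeline, and every segment fully hidden behind a newer version is reported so the supervisor can drop it. The same logic as a standalone sketch (findReplacedSegments is a hypothetical name):

import java.util.HashSet;
import java.util.Set;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.timeline.TimelineObjectHolder;
import org.apache.druid.timeline.VersionedIntervalTimeline;
import org.apache.druid.timeline.partition.PartitionChunk;

final class OvershadowSketch
{
    // Standalone version of the overshadow computation in runTask above.
    static Set<DataSegment> findReplacedSegments(Set<DataSegment> locked, Set<DataSegment> pushed)
    {
        final Set<DataSegment> all = new HashSet<>(locked);
        all.addAll(pushed);
        // Arrange every segment by interval and version...
        final VersionedIntervalTimeline<String, DataSegment> timeline =
            VersionedIntervalTimeline.forSegments(all);
        // ...and collect those completely hidden behind a newer version.
        final Set<DataSegment> overshadowed = new HashSet<>();
        for (TimelineObjectHolder<String, DataSegment> holder : timeline.findFullyOvershadowed()) {
            for (PartitionChunk<DataSegment> chunk : holder.getObject()) {
                overshadowed.add(chunk.getObject());
            }
        }
        return overshadowed;
    }
}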

Aggregations

InputSource (org.apache.druid.data.input.InputSource): 31
InputFormat (org.apache.druid.data.input.InputFormat): 20
Test (org.junit.Test): 19
DataSchema (org.apache.druid.segment.indexing.DataSchema): 18
SamplerResponse (org.apache.druid.client.indexing.SamplerResponse): 14
InlineInputSource (org.apache.druid.data.input.impl.InlineInputSource): 14
InitializedNullHandlingTest (org.apache.druid.testing.InitializedNullHandlingTest): 14
SamplerResponseRow (org.apache.druid.client.indexing.SamplerResponse.SamplerResponseRow): 13
CsvInputFormat (org.apache.druid.data.input.impl.CsvInputFormat): 13
DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec): 13
TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec): 13
RecordSupplierInputSource (org.apache.druid.indexing.seekablestream.RecordSupplierInputSource): 13
JsonInputFormat (org.apache.druid.data.input.impl.JsonInputFormat): 12
GranularitySpec (org.apache.druid.segment.indexing.granularity.GranularitySpec): 11
UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec): 8
AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory): 7
LongSumAggregatorFactory (org.apache.druid.query.aggregation.LongSumAggregatorFactory): 7
IOException (java.io.IOException): 6
ParseExceptionHandler (org.apache.druid.segment.incremental.ParseExceptionHandler): 6
File (java.io.File): 5