
Example 41 with DataSchema

Use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

The class IndexTask, method runTask:

@Override
public TaskStatus runTask(final TaskToolbox toolbox) {
    try {
        log.debug("Found chat handler of class[%s]", toolbox.getChatHandlerProvider().getClass().getName());
        if (toolbox.getChatHandlerProvider().get(getId()).isPresent()) {
            // This is a workaround for ParallelIndexSupervisorTask to avoid double registration when it runs in
            // sequential mode. See ParallelIndexSupervisorTask.runSequential().
            // Note that no HTTP endpoints are available in this case. This works only for
            // ParallelIndexSupervisorTask because it doesn't support APIs for live ingestion reports.
            log.warn("Chat handler is already registered. Skipping chat handler registration.");
        } else {
            toolbox.getChatHandlerProvider().register(getId(), this, false);
        }
        this.authorizerMapper = toolbox.getAuthorizerMapper();
        this.determinePartitionsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
        this.buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
        this.determinePartitionsParseExceptionHandler = new ParseExceptionHandler(
            determinePartitionsMeters,
            ingestionSchema.getTuningConfig().isLogParseExceptions(),
            ingestionSchema.getTuningConfig().getMaxParseExceptions(),
            ingestionSchema.getTuningConfig().getMaxSavedParseExceptions()
        );
        this.buildSegmentsParseExceptionHandler = new ParseExceptionHandler(
            buildSegmentsMeters,
            ingestionSchema.getTuningConfig().isLogParseExceptions(),
            ingestionSchema.getTuningConfig().getMaxParseExceptions(),
            ingestionSchema.getTuningConfig().getMaxSavedParseExceptions()
        );
        final boolean determineIntervals = ingestionSchema.getDataSchema().getGranularitySpec().inputIntervals().isEmpty();
        final InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
        final File tmpDir = toolbox.getIndexingTmpDir();
        ingestionState = IngestionState.DETERMINE_PARTITIONS;
        // Initialize maxRowsPerSegment and maxTotalRows lazily
        final IndexTuningConfig tuningConfig = ingestionSchema.tuningConfig;
        final PartitionsSpec partitionsSpec = tuningConfig.getGivenOrDefaultPartitionsSpec();
        final PartitionAnalysis partitionAnalysis = determineShardSpecs(toolbox, inputSource, tmpDir, partitionsSpec);
        final List<Interval> allocateIntervals = new ArrayList<>(partitionAnalysis.getAllIntervalsToIndex());
        final DataSchema dataSchema;
        if (determineIntervals) {
            final boolean gotLocks = determineLockGranularityAndTryLock(toolbox.getTaskActionClient(), allocateIntervals, ingestionSchema.getIOConfig());
            if (!gotLocks) {
                throw new ISE("Failed to get locks for intervals[%s]", allocateIntervals);
            }
            dataSchema = ingestionSchema.getDataSchema().withGranularitySpec(
                ingestionSchema.getDataSchema()
                               .getGranularitySpec()
                               .withIntervals(JodaUtils.condenseIntervals(allocateIntervals))
            );
        } else {
            dataSchema = ingestionSchema.getDataSchema();
        }
        ingestionState = IngestionState.BUILD_SEGMENTS;
        return generateAndPublishSegments(toolbox, dataSchema, inputSource, tmpDir, partitionAnalysis);
    } catch (Exception e) {
        log.error(e, "Encountered exception in %s.", ingestionState);
        errorMsg = Throwables.getStackTraceAsString(e);
        toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports());
        return TaskStatus.failure(getId(), errorMsg);
    } finally {
        toolbox.getChatHandlerProvider().unregister(getId());
    }
}
Also used: InputSource(org.apache.druid.data.input.InputSource), CompletePartitionAnalysis(org.apache.druid.indexing.common.task.batch.partition.CompletePartitionAnalysis), LinearPartitionAnalysis(org.apache.druid.indexing.common.task.batch.partition.LinearPartitionAnalysis), HashPartitionAnalysis(org.apache.druid.indexing.common.task.batch.partition.HashPartitionAnalysis), PartitionAnalysis(org.apache.druid.indexing.common.task.batch.partition.PartitionAnalysis), ArrayList(java.util.ArrayList), IOException(java.io.IOException), ExecutionException(java.util.concurrent.ExecutionException), TimeoutException(java.util.concurrent.TimeoutException), DataSchema(org.apache.druid.segment.indexing.DataSchema), PartitionsSpec(org.apache.druid.indexer.partitions.PartitionsSpec), DynamicPartitionsSpec(org.apache.druid.indexer.partitions.DynamicPartitionsSpec), HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec), ParseExceptionHandler(org.apache.druid.segment.incremental.ParseExceptionHandler), ISE(org.apache.druid.java.util.common.ISE), File(java.io.File), Interval(org.joda.time.Interval)
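
The interesting DataSchema move here is at the end of the determineIntervals branch: the task condenses the intervals it discovered and swaps them into a copy of the schema via withGranularitySpec. Below is a minimal sketch of that pattern in isolation; the interval values, class name, and main method are hypothetical, while the condense-and-replace calls are the ones runTask uses.

import java.util.Arrays;
import java.util.List;

import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.java.util.common.JodaUtils;
import org.apache.druid.segment.indexing.DataSchema;
import org.joda.time.Interval;

class IntervalCondenseSketch
{
    // Returns a copy of the schema whose granularitySpec carries the condensed
    // intervals, mirroring the determineIntervals branch of IndexTask.runTask.
    static DataSchema withCondensedIntervals(DataSchema dataSchema, List<Interval> allocateIntervals)
    {
        return dataSchema.withGranularitySpec(
            dataSchema.getGranularitySpec()
                      .withIntervals(JodaUtils.condenseIntervals(allocateIntervals))
        );
    }

    public static void main(String[] args)
    {
        // Two adjacent day-long intervals condense into a single two-day interval.
        List<Interval> determined = Arrays.asList(
            Intervals.of("2020-01-01/2020-01-02"),
            Intervals.of("2020-01-02/2020-01-03")
        );
        System.out.println(JodaUtils.condenseIntervals(determined));
    }
}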

Example 42 with DataSchema

Use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

The class PartialDimensionCardinalityTask, method runTask:

@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception {
    DataSchema dataSchema = ingestionSchema.getDataSchema();
    GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
    ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
    HashedPartitionsSpec partitionsSpec = (HashedPartitionsSpec) tuningConfig.getPartitionsSpec();
    Preconditions.checkNotNull(partitionsSpec, "partitionsSpec required in tuningConfig");
    InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
    InputFormat inputFormat = inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null;
    final RowIngestionMeters buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
    final ParseExceptionHandler parseExceptionHandler = new ParseExceptionHandler(
        buildSegmentsMeters,
        tuningConfig.isLogParseExceptions(),
        tuningConfig.getMaxParseExceptions(),
        tuningConfig.getMaxSavedParseExceptions()
    );
    final boolean determineIntervals = granularitySpec.inputIntervals().isEmpty();
    try (final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
        toolbox.getIndexingTmpDir(),
        dataSchema,
        inputSource,
        inputFormat,
        determineIntervals ? Objects::nonNull : AbstractBatchIndexTask.defaultRowFilter(granularitySpec),
        buildSegmentsMeters,
        parseExceptionHandler
    )) {
        Map<Interval, byte[]> cardinalities = determineCardinalities(inputRowIterator, granularitySpec);
        sendReport(toolbox, new DimensionCardinalityReport(getId(), cardinalities));
    }
    return TaskStatus.success(getId());
}
Also used: HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec), InputSource(org.apache.druid.data.input.InputSource), DataSchema(org.apache.druid.segment.indexing.DataSchema), GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec), InputFormat(org.apache.druid.data.input.InputFormat), ParseExceptionHandler(org.apache.druid.segment.incremental.ParseExceptionHandler), InputRow(org.apache.druid.data.input.InputRow), RowIngestionMeters(org.apache.druid.segment.incremental.RowIngestionMeters), Interval(org.joda.time.Interval)
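
The cardinalities map sent in the report holds one serialized value per bucket interval. As a rough illustration of what determineCardinalities computes, here is a hypothetical exact-count version that buckets rows by segment granularity and counts distinct values of one dimension with a HashSet; the real task uses compact cardinality sketches instead, which is why the report carries byte arrays.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.druid.data.input.InputRow;
import org.apache.druid.java.util.common.granularity.Granularity;
import org.apache.druid.java.util.common.parsers.CloseableIterator;
import org.joda.time.Interval;

class CardinalityCountSketch
{
    // Exact per-interval distinct counts for a single dimension. Illustrative only:
    // memory use is unbounded, unlike the fixed-size sketches the task serializes.
    static Map<Interval, Integer> countDistinct(
        CloseableIterator<InputRow> rows,
        Granularity segmentGranularity,
        String dimension
    )
    {
        Map<Interval, Set<Object>> seen = new HashMap<>();
        while (rows.hasNext()) {
            InputRow row = rows.next();
            Interval bucket = segmentGranularity.bucket(row.getTimestamp());
            seen.computeIfAbsent(bucket, i -> new HashSet<>()).add(row.getRaw(dimension));
        }
        Map<Interval, Integer> cardinalities = new HashMap<>();
        seen.forEach((interval, values) -> cardinalities.put(interval, values.size()));
        return cardinalities;
    }
}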

Example 43 with DataSchema

Use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

The class PartialDimensionDistributionTask, method runTask:

@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception {
    DataSchema dataSchema = ingestionSchema.getDataSchema();
    GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
    ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
    DimensionRangePartitionsSpec partitionsSpec = (DimensionRangePartitionsSpec) tuningConfig.getPartitionsSpec();
    Preconditions.checkNotNull(partitionsSpec, "partitionsSpec required in tuningConfig");
    final List<String> partitionDimensions = partitionsSpec.getPartitionDimensions();
    Preconditions.checkArgument(partitionDimensions != null && !partitionDimensions.isEmpty(), "partitionDimension required in partitionsSpec");
    boolean isAssumeGrouped = partitionsSpec.isAssumeGrouped();
    InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
    InputFormat inputFormat = inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null;
    final RowIngestionMeters buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
    final ParseExceptionHandler parseExceptionHandler = new ParseExceptionHandler(
        buildSegmentsMeters,
        tuningConfig.isLogParseExceptions(),
        tuningConfig.getMaxParseExceptions(),
        tuningConfig.getMaxSavedParseExceptions()
    );
    final boolean determineIntervals = granularitySpec.inputIntervals().isEmpty();
    try (final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
        toolbox.getIndexingTmpDir(),
        dataSchema,
        inputSource,
        inputFormat,
        determineIntervals ? Objects::nonNull : AbstractBatchIndexTask.defaultRowFilter(granularitySpec),
        buildSegmentsMeters,
        parseExceptionHandler
    );
        HandlingInputRowIterator iterator =
            new RangePartitionIndexTaskInputRowIteratorBuilder(partitionDimensions, SKIP_NULL)
                .delegate(inputRowIterator)
                .granularitySpec(granularitySpec)
                .build()) {
        Map<Interval, StringDistribution> distribution = determineDistribution(iterator, granularitySpec, partitionDimensions, isAssumeGrouped);
        sendReport(toolbox, new DimensionDistributionReport(getId(), distribution));
    }
    return TaskStatus.success(getId());
}
Also used: InputSource(org.apache.druid.data.input.InputSource), StringDistribution(org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution), DimensionRangePartitionsSpec(org.apache.druid.indexer.partitions.DimensionRangePartitionsSpec), HandlingInputRowIterator(org.apache.druid.data.input.HandlingInputRowIterator), DataSchema(org.apache.druid.segment.indexing.DataSchema), GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec), InputFormat(org.apache.druid.data.input.InputFormat), ParseExceptionHandler(org.apache.druid.segment.incremental.ParseExceptionHandler), InputRow(org.apache.druid.data.input.InputRow), RangePartitionIndexTaskInputRowIteratorBuilder(org.apache.druid.indexing.common.task.batch.parallel.iterator.RangePartitionIndexTaskInputRowIteratorBuilder), RowIngestionMeters(org.apache.druid.segment.incremental.RowIngestionMeters), Interval(org.joda.time.Interval)
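
The distributions reported here feed range partitioning: the supervisor later cuts each interval's sorted value distribution into partition boundaries of roughly equal row weight. Below is a hypothetical sketch of that cut, using an exact-count TreeMap where the task itself works with StringDistribution sketches; names and the target parameter are illustrative.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

class RangeBoundarySketch
{
    // Picks split values so each resulting partition holds roughly
    // targetRowsPerSegment rows, given sorted per-value counts.
    static List<String> splitPoints(TreeMap<String, Long> valueCounts, long targetRowsPerSegment)
    {
        List<String> splits = new ArrayList<>();
        long accumulated = 0;
        for (Map.Entry<String, Long> e : valueCounts.entrySet()) {
            accumulated += e.getValue();
            if (accumulated >= targetRowsPerSegment) {
                splits.add(e.getKey());  // close the current range at this value
                accumulated = 0;
            }
        }
        return splits;
    }
}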

Example 44 with DataSchema

Use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

The class HashPartitionTaskKillTest, method createTestTask:

private ParallelIndexSupervisorTask createTestTask(
    @Nullable TimestampSpec timestampSpec,
    @Nullable DimensionsSpec dimensionsSpec,
    @Nullable InputFormat inputFormat,
    @Nullable ParseSpec parseSpec,
    Interval interval,
    File inputDir,
    String filter,
    PartitionsSpec partitionsSpec,
    int maxNumConcurrentSubTasks,
    boolean appendToExisting,
    boolean useInputFormatApi,
    int succeedsBeforeFailing
) {
    GranularitySpec granularitySpec = new UniformGranularitySpec(SEGMENT_GRANULARITY, Granularities.MINUTE, interval == null ? null : Collections.singletonList(interval));
    ParallelIndexTuningConfig tuningConfig = newTuningConfig(partitionsSpec, maxNumConcurrentSubTasks, !appendToExisting);
    final ParallelIndexIngestionSpec ingestionSpec;
    if (useInputFormatApi) {
        Preconditions.checkArgument(parseSpec == null);
        ParallelIndexIOConfig ioConfig = new ParallelIndexIOConfig(null, new LocalInputSource(inputDir, filter), inputFormat, appendToExisting, null);
        ingestionSpec = new ParallelIndexIngestionSpec(
            new DataSchema(
                DATASOURCE,
                timestampSpec,
                dimensionsSpec,
                new AggregatorFactory[]{new LongSumAggregatorFactory("val", "val")},
                granularitySpec,
                null
            ),
            ioConfig,
            tuningConfig
        );
    } else {
        Preconditions.checkArgument(inputFormat == null);
        ParallelIndexIOConfig ioConfig = new ParallelIndexIOConfig(new LocalFirehoseFactory(inputDir, filter, null), appendToExisting);
        // noinspection unchecked
        ingestionSpec = new ParallelIndexIngestionSpec(
            new DataSchema(
                "dataSource",
                getObjectMapper().convertValue(new StringInputRowParser(parseSpec, null), Map.class),
                new AggregatorFactory[]{new LongSumAggregatorFactory("val", "val")},
                granularitySpec,
                null,
                getObjectMapper()
            ),
            ioConfig,
            tuningConfig
        );
    }
    return new ParallelIndexSupervisorTaskTest(null, null, null, ingestionSpec, null, Collections.emptyMap(), succeedsBeforeFailing);
}
Also used: DataSchema(org.apache.druid.segment.indexing.DataSchema), UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec), GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec), LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory), StringInputRowParser(org.apache.druid.data.input.impl.StringInputRowParser), LocalFirehoseFactory(org.apache.druid.segment.realtime.firehose.LocalFirehoseFactory), Map(java.util.Map), LocalInputSource(org.apache.druid.data.input.impl.LocalInputSource)
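
For reference, here is the inputFormat-branch DataSchema from createTestTask rebuilt in isolation, using the same six-argument constructor with a null transformSpec, as in the test above. The column names, granularities, and interval are made-up placeholders.

import java.util.Arrays;
import java.util.Collections;

import org.apache.druid.data.input.impl.DimensionsSpec;
import org.apache.druid.data.input.impl.TimestampSpec;
import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.query.aggregation.LongSumAggregatorFactory;
import org.apache.druid.segment.indexing.DataSchema;
import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec;

class DataSchemaSketch
{
    static DataSchema example()
    {
        return new DataSchema(
            "dataSource",                                   // datasource name
            new TimestampSpec("ts", "auto", null),          // timestamp column, auto-detected format
            new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList("dim1", "dim2"))),
            new AggregatorFactory[]{new LongSumAggregatorFactory("val", "val")},
            new UniformGranularitySpec(
                Granularities.DAY,                          // segment granularity
                Granularities.MINUTE,                       // query granularity
                Collections.singletonList(Intervals.of("2020-01-01/2020-01-02"))
            ),
            null                                            // no transformSpec
        );
    }
}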

Example 45 with DataSchema

Use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

The class PartialRangeSegmentGenerateTaskTest, method requiresGranularitySpecInputIntervals:

@Test
public void requiresGranularitySpecInputIntervals() {
    exception.expect(IllegalArgumentException.class);
    exception.expectMessage("Missing intervals in granularitySpec");
    DataSchema dataSchema = ParallelIndexTestingFactory.createDataSchema(Collections.emptyList());
    new PartialRangeSegmentGenerateTaskBuilder().dataSchema(dataSchema).build();
}
Also used: DataSchema(org.apache.druid.segment.indexing.DataSchema), Test(org.junit.Test)
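
The guard this test exercises amounts to a precondition on the schema's input intervals. A minimal sketch of such a check, assuming the same error message; the helper class itself is hypothetical.

import com.google.common.base.Preconditions;

import org.apache.druid.segment.indexing.DataSchema;

class IntervalGuardSketch
{
    // Rejects schemas whose granularitySpec carries no input intervals,
    // matching the IllegalArgumentException the test expects.
    static void requireInputIntervals(DataSchema dataSchema)
    {
        Preconditions.checkArgument(
            !dataSchema.getGranularitySpec().inputIntervals().isEmpty(),
            "Missing intervals in granularitySpec"
        );
    }
}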

Aggregations

DataSchema (org.apache.druid.segment.indexing.DataSchema): 80 usages
UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec): 49 usages
TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec): 45 usages
Test (org.junit.Test): 44 usages
DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec): 32 usages
AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory): 25 usages
LongSumAggregatorFactory (org.apache.druid.query.aggregation.LongSumAggregatorFactory): 22 usages
GranularitySpec (org.apache.druid.segment.indexing.granularity.GranularitySpec): 19 usages
InputSource (org.apache.druid.data.input.InputSource): 17 usages
InitializedNullHandlingTest (org.apache.druid.testing.InitializedNullHandlingTest): 17 usages
File (java.io.File): 16 usages
Map (java.util.Map): 15 usages
InputFormat (org.apache.druid.data.input.InputFormat): 15 usages
CountAggregatorFactory (org.apache.druid.query.aggregation.CountAggregatorFactory): 15 usages
SamplerResponse (org.apache.druid.client.indexing.SamplerResponse): 14 usages
SamplerResponseRow (org.apache.druid.client.indexing.SamplerResponse.SamplerResponseRow): 13 usages
CsvInputFormat (org.apache.druid.data.input.impl.CsvInputFormat): 13 usages
Interval (org.joda.time.Interval): 13 usages
ArrayList (java.util.ArrayList): 12 usages
JsonInputFormat (org.apache.druid.data.input.impl.JsonInputFormat): 12 usages