
Example 16 with GranularitySpec

Use of org.apache.druid.segment.indexing.granularity.GranularitySpec in project druid by druid-io.

The class IndexTask, method collectIntervalsAndShardSpecs.

private Map<Interval, Optional<HyperLogLogCollector>> collectIntervalsAndShardSpecs(
    ObjectMapper jsonMapper,
    IndexIngestionSpec ingestionSchema,
    InputSource inputSource,
    File tmpDir,
    GranularitySpec granularitySpec,
    @Nonnull PartitionsSpec partitionsSpec,
    boolean determineIntervals
) throws IOException {
    final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = new TreeMap<>(Comparators.intervalsByStartThenEnd());
    final Granularity queryGranularity = granularitySpec.getQueryGranularity();
    final Predicate<InputRow> rowFilter = inputRow -> {
        if (inputRow == null) {
            return false;
        }
        if (determineIntervals) {
            return true;
        }
        final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
        return optInterval.isPresent();
    };
    try (final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
        tmpDir,
        ingestionSchema.getDataSchema(),
        inputSource,
        inputSource.needsFormat() ? getInputFormat(ingestionSchema) : null,
        rowFilter,
        determinePartitionsMeters,
        determinePartitionsParseExceptionHandler
    )) {
        while (inputRowIterator.hasNext()) {
            final InputRow inputRow = inputRowIterator.next();
            final Interval interval;
            if (determineIntervals) {
                interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
            } else {
                final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
                // this interval must exist since it passed the rowFilter
                assert optInterval.isPresent();
                interval = optInterval.get();
            }
            if (partitionsSpec.needsDeterminePartitions(false)) {
                hllCollectors.computeIfAbsent(interval, intv -> Optional.of(HyperLogLogCollector.makeLatestCollector()));
                List<Object> groupKey = Rows.toGroupKey(queryGranularity.bucketStart(inputRow.getTimestampFromEpoch()), inputRow);
                hllCollectors.get(interval).get().add(HASH_FUNCTION.hashBytes(jsonMapper.writeValueAsBytes(groupKey)).asBytes());
            } else {
                // we don't need to determine partitions but we still need to determine intervals, so add an Optional.absent()
                // for the interval and don't instantiate a HLL collector
                hllCollectors.putIfAbsent(interval, Optional.absent());
            }
            determinePartitionsMeters.incrementProcessed();
        }
    }
    // These metrics are reported in generateAndPublishSegments()
    if (determinePartitionsMeters.getThrownAway() > 0) {
        log.warn("Unable to find a matching interval for [%,d] events", determinePartitionsMeters.getThrownAway());
    }
    if (determinePartitionsMeters.getUnparseable() > 0) {
        log.warn("Unable to parse [%,d] events", determinePartitionsMeters.getUnparseable());
    }
    return hllCollectors;
}
Also used : Comparators(org.apache.druid.java.util.common.guava.Comparators) Optional(com.google.common.base.Optional) Map(java.util.Map) TreeMap(java.util.TreeMap) List(java.util.List) Predicate(java.util.function.Predicate) File(java.io.File) IOException(java.io.IOException) Nonnull(javax.annotation.Nonnull) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) HashFunction(com.google.common.hash.HashFunction) Hashing(com.google.common.hash.Hashing) HyperLogLogCollector(org.apache.druid.hll.HyperLogLogCollector) InputRow(org.apache.druid.data.input.InputRow) InputSource(org.apache.druid.data.input.InputSource) InputFormat(org.apache.druid.data.input.InputFormat) Rows(org.apache.druid.data.input.Rows) CloseableIterator(org.apache.druid.java.util.common.parsers.CloseableIterator) Granularity(org.apache.druid.java.util.common.granularity.Granularity) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) PartitionsSpec(org.apache.druid.indexer.partitions.PartitionsSpec) Interval(org.joda.time.Interval)
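
The per-interval HyperLogLog bookkeeping above is the heart of the "determine partitions" pass: each row is bucketed into a segment interval, and a hash of its group key (query-granularity timestamp plus dimensions) is added to that interval's collector, so the task can later size hash partitions from the estimated cardinality. Below is a minimal standalone sketch of the same idea using only HyperLogLogCollector and Guava hashing; the hard-coded events and day-string bucketing are illustrative stand-ins, not IndexTask code.

// Sketch: estimate distinct group keys per day, mirroring IndexTask's
// Map<Interval, HyperLogLogCollector>. Events and bucketing are illustrative.
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.TreeMap;
import org.apache.druid.hll.HyperLogLogCollector;

public class IntervalCardinalitySketch {
    private static final HashFunction HASH = Hashing.murmur3_128();

    public static void main(String[] args) {
        Map<String, HyperLogLogCollector> collectors = new TreeMap<>();
        String[][] events = {
            { "2014-01-01", "us", "web" },
            { "2014-01-01", "us", "web" },      // duplicate group key, counted once
            { "2014-01-01", "eu", "mobile" },
            { "2014-01-02", "us", "web" },
        };
        for (String[] event : events) {
            String day = event[0];              // stand-in for granularitySpec bucketing
            String groupKey = String.join("|", event);
            collectors.computeIfAbsent(day, d -> HyperLogLogCollector.makeLatestCollector())
                      .add(HASH.hashBytes(groupKey.getBytes(StandardCharsets.UTF_8)).asBytes());
        }
        collectors.forEach((day, hll) ->
            System.out.printf("%s -> ~%.0f distinct group keys%n", day, hll.estimateCardinality()));
    }
}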

Example 17 with GranularitySpec

Use of org.apache.druid.segment.indexing.granularity.GranularitySpec in project druid by druid-io.

The class PartialDimensionCardinalityTask, method runTask.

@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception {
    DataSchema dataSchema = ingestionSchema.getDataSchema();
    GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
    ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
    HashedPartitionsSpec partitionsSpec = (HashedPartitionsSpec) tuningConfig.getPartitionsSpec();
    Preconditions.checkNotNull(partitionsSpec, "partitionsSpec required in tuningConfig");
    InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
    InputFormat inputFormat = inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null;
    final RowIngestionMeters buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
    final ParseExceptionHandler parseExceptionHandler = new ParseExceptionHandler(buildSegmentsMeters, tuningConfig.isLogParseExceptions(), tuningConfig.getMaxParseExceptions(), tuningConfig.getMaxSavedParseExceptions());
    final boolean determineIntervals = granularitySpec.inputIntervals().isEmpty();
    try (final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
        toolbox.getIndexingTmpDir(),
        dataSchema,
        inputSource,
        inputFormat,
        determineIntervals ? Objects::nonNull : AbstractBatchIndexTask.defaultRowFilter(granularitySpec),
        buildSegmentsMeters,
        parseExceptionHandler
    )) {
        Map<Interval, byte[]> cardinalities = determineCardinalities(inputRowIterator, granularitySpec);
        sendReport(toolbox, new DimensionCardinalityReport(getId(), cardinalities));
    }
    return TaskStatus.success(getId());
}
Also used : HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) InputSource(org.apache.druid.data.input.InputSource) DataSchema(org.apache.druid.segment.indexing.DataSchema) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) InputFormat(org.apache.druid.data.input.InputFormat) ParseExceptionHandler(org.apache.druid.segment.incremental.ParseExceptionHandler) InputRow(org.apache.druid.data.input.InputRow) RowIngestionMeters(org.apache.druid.segment.incremental.RowIngestionMeters) Interval(org.joda.time.Interval)
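
The determineCardinalities helper invoked here is private and not shown; conceptually it folds each row's hashed group key into one HyperLogLogCollector per interval and serializes the collectors for the DimensionCardinalityReport. A hedged sketch of that shape follows; the method name, the toString-based group key, and the assumption that every row's interval resolves are all illustrative simplifications.

// Illustrative sketch only: the real PartialDimensionCardinalityTask#determineCardinalities
// is private and may differ in detail.
private Map<Interval, byte[]> determineCardinalitiesSketch(
    CloseableIterator<InputRow> rows,
    GranularitySpec granularitySpec
) {
    Map<Interval, HyperLogLogCollector> collectors = new HashMap<>();
    while (rows.hasNext()) {
        InputRow row = rows.next();
        // Assumes the row's timestamp falls in a configured bucket; upstream
        // filtering (defaultRowFilter) makes that true when intervals are known.
        Interval interval = granularitySpec.bucketInterval(row.getTimestamp()).get();
        byte[] groupKey = row.getDimensions().toString().getBytes(StandardCharsets.UTF_8);
        collectors.computeIfAbsent(interval, i -> HyperLogLogCollector.makeLatestCollector())
                  .add(Hashing.murmur3_128().hashBytes(groupKey).asBytes());
    }
    Map<Interval, byte[]> serialized = new HashMap<>();
    collectors.forEach((interval, hll) -> serialized.put(interval, hll.toByteArray()));
    return serialized;
}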

Example 18 with GranularitySpec

Use of org.apache.druid.segment.indexing.granularity.GranularitySpec in project druid by druid-io.

The class PartialDimensionDistributionTask, method runTask.

@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception {
    DataSchema dataSchema = ingestionSchema.getDataSchema();
    GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
    ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
    DimensionRangePartitionsSpec partitionsSpec = (DimensionRangePartitionsSpec) tuningConfig.getPartitionsSpec();
    Preconditions.checkNotNull(partitionsSpec, "partitionsSpec required in tuningConfig");
    final List<String> partitionDimensions = partitionsSpec.getPartitionDimensions();
    Preconditions.checkArgument(partitionDimensions != null && !partitionDimensions.isEmpty(), "partitionDimension required in partitionsSpec");
    boolean isAssumeGrouped = partitionsSpec.isAssumeGrouped();
    InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
    InputFormat inputFormat = inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null;
    final RowIngestionMeters buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
    final ParseExceptionHandler parseExceptionHandler = new ParseExceptionHandler(buildSegmentsMeters, tuningConfig.isLogParseExceptions(), tuningConfig.getMaxParseExceptions(), tuningConfig.getMaxSavedParseExceptions());
    final boolean determineIntervals = granularitySpec.inputIntervals().isEmpty();
    try (final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
        toolbox.getIndexingTmpDir(),
        dataSchema,
        inputSource,
        inputFormat,
        determineIntervals ? Objects::nonNull : AbstractBatchIndexTask.defaultRowFilter(granularitySpec),
        buildSegmentsMeters,
        parseExceptionHandler
    );
        HandlingInputRowIterator iterator = new RangePartitionIndexTaskInputRowIteratorBuilder(partitionDimensions, SKIP_NULL)
            .delegate(inputRowIterator)
            .granularitySpec(granularitySpec)
            .build()) {
        Map<Interval, StringDistribution> distribution = determineDistribution(iterator, granularitySpec, partitionDimensions, isAssumeGrouped);
        sendReport(toolbox, new DimensionDistributionReport(getId(), distribution));
    }
    return TaskStatus.success(getId());
}
Also used : InputSource(org.apache.druid.data.input.InputSource) StringDistribution(org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution) DimensionRangePartitionsSpec(org.apache.druid.indexer.partitions.DimensionRangePartitionsSpec) HandlingInputRowIterator(org.apache.druid.data.input.HandlingInputRowIterator) DataSchema(org.apache.druid.segment.indexing.DataSchema) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) InputFormat(org.apache.druid.data.input.InputFormat) ParseExceptionHandler(org.apache.druid.segment.incremental.ParseExceptionHandler) InputRow(org.apache.druid.data.input.InputRow) RangePartitionIndexTaskInputRowIteratorBuilder(org.apache.druid.indexing.common.task.batch.parallel.iterator.RangePartitionIndexTaskInputRowIteratorBuilder) RowIngestionMeters(org.apache.druid.segment.incremental.RowIngestionMeters) Interval(org.joda.time.Interval)
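
determineDistribution (also private) builds one StringDistribution per interval, sketching the values of the partition dimension so the supervisor can later merge the sketches and choose range boundaries. The simplified stand-in below replaces the StringDistribution sketch with a TreeSet to show the bucketing shape; the memory behavior of the real sketch is very different, and the method name is invented for illustration.

// Simplified stand-in for determineDistribution: collect sorted partition-dimension
// values per interval. The real task uses a StringDistribution sketch, not a TreeSet.
private Map<Interval, TreeSet<String>> determineDistributionSketch(
    Iterator<InputRow> rows,
    GranularitySpec granularitySpec,
    String partitionDimension
) {
    Map<Interval, TreeSet<String>> valuesPerInterval = new HashMap<>();
    while (rows.hasNext()) {
        InputRow row = rows.next();
        Interval interval = granularitySpec.bucketInterval(row.getTimestamp()).get();
        // getDimension returns the row's values for this dimension (possibly multi-valued)
        for (String value : row.getDimension(partitionDimension)) {
            valuesPerInterval.computeIfAbsent(interval, i -> new TreeSet<>()).add(value);
        }
    }
    return valuesPerInterval;
}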

Example 19 with GranularitySpec

Use of org.apache.druid.segment.indexing.granularity.GranularitySpec in project druid by druid-io.

The class IndexTaskInputRowIteratorBuilderTestingFactory, method createGranularitySpec.

static GranularitySpec createGranularitySpec(DateTime timestamp, Optional<Interval> bucketIntervalOpt) {
    GranularitySpec granularitySpec = EasyMock.mock(GranularitySpec.class);
    EasyMock.expect(granularitySpec.bucketInterval(timestamp)).andStubReturn(bucketIntervalOpt);
    EasyMock.replay(granularitySpec);
    return granularitySpec;
}
Also used : GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec)
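
The factory stubs only bucketInterval, the one GranularitySpec method the iterator builders consult, so a test can steer a row into or out of a bucket without configuring a full spec. A typical use might look like the following; the timestamps and assertions are illustrative.

// Illustrative usage of the mocked spec in a test.
DateTime timestamp = DateTimes.of("2014-01-01T00:00:00Z");
Interval bucket = Intervals.of("2014-01-01/2014-01-02");

GranularitySpec present = IndexTaskInputRowIteratorBuilderTestingFactory
    .createGranularitySpec(timestamp, Optional.of(bucket));
Assert.assertEquals(bucket, present.bucketInterval(timestamp).get());

GranularitySpec absent = IndexTaskInputRowIteratorBuilderTestingFactory
    .createGranularitySpec(timestamp, Optional.absent());
Assert.assertFalse(absent.bucketInterval(timestamp).isPresent());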

Example 20 with GranularitySpec

Use of org.apache.druid.segment.indexing.granularity.GranularitySpec in project druid by druid-io.

The class HashPartitionTaskKillTest, method createTestTask.

private ParallelIndexSupervisorTask createTestTask(
    @Nullable TimestampSpec timestampSpec,
    @Nullable DimensionsSpec dimensionsSpec,
    @Nullable InputFormat inputFormat,
    @Nullable ParseSpec parseSpec,
    Interval interval,
    File inputDir,
    String filter,
    PartitionsSpec partitionsSpec,
    int maxNumConcurrentSubTasks,
    boolean appendToExisting,
    boolean useInputFormatApi,
    int succeedsBeforeFailing
) {
    GranularitySpec granularitySpec = new UniformGranularitySpec(SEGMENT_GRANULARITY, Granularities.MINUTE, interval == null ? null : Collections.singletonList(interval));
    ParallelIndexTuningConfig tuningConfig = newTuningConfig(partitionsSpec, maxNumConcurrentSubTasks, !appendToExisting);
    final ParallelIndexIngestionSpec ingestionSpec;
    if (useInputFormatApi) {
        Preconditions.checkArgument(parseSpec == null);
        ParallelIndexIOConfig ioConfig = new ParallelIndexIOConfig(null, new LocalInputSource(inputDir, filter), inputFormat, appendToExisting, null);
        ingestionSpec = new ParallelIndexIngestionSpec(
            new DataSchema(
                DATASOURCE,
                timestampSpec,
                dimensionsSpec,
                new AggregatorFactory[] { new LongSumAggregatorFactory("val", "val") },
                granularitySpec,
                null
            ),
            ioConfig,
            tuningConfig
        );
    } else {
        Preconditions.checkArgument(inputFormat == null);
        ParallelIndexIOConfig ioConfig = new ParallelIndexIOConfig(new LocalFirehoseFactory(inputDir, filter, null), appendToExisting);
        // noinspection unchecked
        ingestionSpec = new ParallelIndexIngestionSpec(new DataSchema("dataSource", getObjectMapper().convertValue(new StringInputRowParser(parseSpec, null), Map.class), new AggregatorFactory[] { new LongSumAggregatorFactory("val", "val") }, granularitySpec, null, getObjectMapper()), ioConfig, tuningConfig);
    }
    return new ParallelIndexSupervisorTaskTest(null, null, null, ingestionSpec, null, Collections.emptyMap(), succeedsBeforeFailing);
}
Also used : DataSchema(org.apache.druid.segment.indexing.DataSchema) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) StringInputRowParser(org.apache.druid.data.input.impl.StringInputRowParser) LocalFirehoseFactory(org.apache.druid.segment.realtime.firehose.LocalFirehoseFactory) Map(java.util.Map) LocalInputSource(org.apache.druid.data.input.impl.LocalInputSource)
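
The UniformGranularitySpec built at the top of createTestTask is the piece the ingestion spec cares about: segment granularity fixes the bucket size, query granularity the timestamp truncation, and the optional interval list determines whether bucketInterval can resolve rows up front. A minimal sketch of that behavior outside the task harness, with illustrative granularities and intervals:

// Sketch: how a UniformGranularitySpec like the one above buckets timestamps.
GranularitySpec spec = new UniformGranularitySpec(
    Granularities.DAY,        // segment granularity (SEGMENT_GRANULARITY in the test)
    Granularities.MINUTE,     // query granularity
    Collections.singletonList(Intervals.of("2014-01-01/2014-01-03"))
);
// Inside the configured intervals: resolves to the containing day bucket,
// i.e. 2014-01-01T00:00:00.000Z/2014-01-02T00:00:00.000Z.
Optional<Interval> hit = spec.bucketInterval(DateTimes.of("2014-01-01T12:34:00Z"));
// Outside the configured intervals: absent, so such a row would be thrown away.
Optional<Interval> miss = spec.bucketInterval(DateTimes.of("2014-02-01T00:00:00Z"));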

Aggregations

GranularitySpec (org.apache.druid.segment.indexing.granularity.GranularitySpec) 32
DataSchema (org.apache.druid.segment.indexing.DataSchema) 21
DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec) 15
TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec) 14
Test (org.junit.Test) 14
UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) 13
InputFormat (org.apache.druid.data.input.InputFormat) 11
InputRow (org.apache.druid.data.input.InputRow) 11
InputSource (org.apache.druid.data.input.InputSource) 11
AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory) 11
LongSumAggregatorFactory (org.apache.druid.query.aggregation.LongSumAggregatorFactory) 11
TransformSpec (org.apache.druid.segment.transform.TransformSpec) 9
InitializedNullHandlingTest (org.apache.druid.testing.InitializedNullHandlingTest) 9
Interval (org.joda.time.Interval) 9
Map (java.util.Map) 8
SamplerResponse (org.apache.druid.client.indexing.SamplerResponse) 8
SamplerResponseRow (org.apache.druid.client.indexing.SamplerResponse.SamplerResponseRow) 8
RecordSupplierInputSource (org.apache.druid.indexing.seekablestream.RecordSupplierInputSource) 8
CsvInputFormat (org.apache.druid.data.input.impl.CsvInputFormat) 7
InlineInputSource (org.apache.druid.data.input.impl.InlineInputSource) 7